550.40.65

russellcnv
2024-06-28 19:18:50 -07:00
parent 3750358633
commit 91726f2e21
194 changed files with 44392 additions and 38511 deletions

View File

@@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),
// Ran out of attempts - return thread even if its stack may not be
// allocated on the preferred node
if ((i == (attempts - 1)))
if (i == (attempts - 1))
break;
// Get the NUMA node where the first page of the stack is resident. If

View File

@@ -1448,7 +1448,9 @@ NV_STATUS UvmAllocSemaphorePool(void *base,
//
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if the destination processor is
// the CPU.
// the CPU. -1 indicates no preference, in which case the pages used
// can be on any of the available CPU NUMA nodes. If NUMA is disabled
// only 0 and -1 are allowed.
//
// Error codes:
// NV_ERR_INVALID_ADDRESS:
@@ -1462,6 +1464,11 @@ NV_STATUS UvmAllocSemaphorePool(void *base,
// The VA range exceeds the largest virtual address supported by the
// destination processor.
//
// NV_ERR_INVALID_ARGUMENT:
// preferredCpuMemoryNode is not a valid CPU NUMA node or it corresponds
// to a NUMA node ID for a registered GPU. If NUMA is disabled, it
// indicates that preferredCpuMemoryNode was not either 0 or -1.
//
// NV_ERR_INVALID_DEVICE:
// destinationUuid does not represent a valid processor such as a CPU or
// a GPU with a GPU VA space registered for it. Or destinationUuid is a
@@ -1528,8 +1535,9 @@ NV_STATUS UvmMigrate(void *base,
//
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if the destination processor is
// the CPU. This argument is ignored if the given virtual address range
// corresponds to managed memory.
// the CPU. -1 indicates no preference, in which case the pages used
// can be on any of the available CPU NUMA nodes. If NUMA is disabled
// only 0 and -1 are allowed.
//
// semaphoreAddress: (INPUT)
// Base address of the semaphore.
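As a usage sketch of the preferredCpuMemoryNode semantics above (assuming the synchronous UvmMigrate prototype takes base, length, destinationUuid and preferredCpuMemoryNode, and that cpu_uuid already holds the CPU processor UUID):

// Hedged sketch: migrate [base, base + length) to the CPU, letting the kernel
// pick any allowed CPU NUMA node (-1 means no preference). Error handling and
// the UUID lookup are elided; the node must be 0 or -1 if NUMA is disabled.
NV_STATUS status = UvmMigrate(base, length, &cpu_uuid, -1);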
@@ -1586,8 +1594,8 @@ NV_STATUS UvmMigrateAsync(void *base,
//
// Migrates the backing of all virtual address ranges associated with the given
// range group to the specified destination processor. The behavior of this API
// is equivalent to calling UvmMigrate on each VA range associated with this
// range group.
// is equivalent to calling UvmMigrate with preferredCpuMemoryNode = -1 on each
// VA range associated with this range group.
//
// Any errors encountered during migration are returned immediately. No attempt
// is made to migrate the remaining unmigrated ranges and the ranges that are
@@ -2169,7 +2177,8 @@ NV_STATUS UvmMapDynamicParallelismRegion(void *base,
//
// If any page in the VA range has a preferred location, then the migration and
// mapping policies associated with this API take precedence over those related
// to the preferred location.
// to the preferred location. If the preferred location is a specific CPU NUMA
// node, that NUMA node will be used for a CPU-resident copy of the page.
//
// If any pages in this VA range have any processors present in their
// accessed-by list, the migration and mapping policies associated with this
@@ -2300,7 +2309,7 @@ NV_STATUS UvmDisableReadDuplication(void *base,
// UvmPreventMigrationRangeGroups has not been called on the range group that
// those pages are associated with, then the migration and mapping policies
// associated with UvmEnableReadDuplication override the policies outlined
// above. Note that enabling read duplication on on any pages in this VA range
// above. Note that enabling read duplication on any pages in this VA range
// does not clear the state set by this API for those pages. It merely overrides
// the policies associated with this state until read duplication is disabled
// for those pages.
@@ -2333,7 +2342,8 @@ NV_STATUS UvmDisableReadDuplication(void *base,
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if preferredLocationUuid is the
// UUID of the CPU. -1 is a special value which indicates all CPU nodes
// allowed by the global and thread memory policies.
// allowed by the global and thread memory policies. If NUMA is disabled
// only 0 and -1 are allowed.
//
// Errors:
// NV_ERR_INVALID_ADDRESS:

View File

@@ -855,6 +855,7 @@ static NV_STATUS cpu_decrypt_in_order(uvm_channel_t *channel,
uvm_mem_t *dst_mem,
uvm_mem_t *src_mem,
const UvmCslIv *decrypt_iv,
NvU32 key_version,
uvm_mem_t *auth_tag_mem,
size_t size,
NvU32 copy_size)
@@ -869,6 +870,7 @@ static NV_STATUS cpu_decrypt_in_order(uvm_channel_t *channel,
dst_plain + i * copy_size,
src_cipher + i * copy_size,
decrypt_iv + i,
key_version,
copy_size,
auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
}
@@ -879,6 +881,7 @@ static NV_STATUS cpu_decrypt_out_of_order(uvm_channel_t *channel,
uvm_mem_t *dst_mem,
uvm_mem_t *src_mem,
const UvmCslIv *decrypt_iv,
NvU32 key_version,
uvm_mem_t *auth_tag_mem,
size_t size,
NvU32 copy_size)
@@ -896,6 +899,7 @@ static NV_STATUS cpu_decrypt_out_of_order(uvm_channel_t *channel,
dst_plain + i * copy_size,
src_cipher + i * copy_size,
decrypt_iv + i,
key_version,
copy_size,
auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
}
@@ -959,7 +963,7 @@ static void gpu_encrypt(uvm_push_t *push,
i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
dst_cipher);
uvm_conf_computing_log_gpu_encryption(push->channel, decrypt_iv);
uvm_conf_computing_log_gpu_encryption(push->channel, copy_size, decrypt_iv);
if (i > 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
@@ -1020,6 +1024,7 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
UvmCslIv *decrypt_iv = NULL;
UvmCslIv *encrypt_iv = NULL;
NvU32 key_version;
uvm_tracker_t tracker;
size_t src_plain_size;
@@ -1089,6 +1094,11 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
gpu_encrypt(&push, dst_cipher, dst_plain_gpu, auth_tag_mem, decrypt_iv, size, copy_size);
// There shouldn't be any key rotation between the end of the push and the
// CPU decryption(s), but forcing the decryption to use the saved key is more
// robust against future test changes.
key_version = uvm_channel_pool_key_version(push.channel->pool);
TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);
TEST_CHECK_GOTO(!mem_match(src_plain, src_cipher, size), out);
@@ -1101,6 +1111,7 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
dst_plain,
dst_cipher,
decrypt_iv,
key_version,
auth_tag_mem,
size,
copy_size),
@@ -1111,6 +1122,7 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
dst_plain,
dst_cipher,
decrypt_iv,
key_version,
auth_tag_mem,
size,
copy_size),

File diff suppressed because it is too large

View File

@@ -228,21 +228,65 @@ typedef struct
// variant is required when the thread holding the pool lock must sleep
// (ex: acquire another mutex) deeper in the call stack, either in UVM or
// RM.
union {
union
{
uvm_spinlock_t spinlock;
uvm_mutex_t mutex;
};
// Secure operations require that uvm_push_begin order matches
// uvm_push_end order, because the engine's state is used in its internal
// operation and each push may modify this state. push_locks is protected by
// the channel pool lock.
DECLARE_BITMAP(push_locks, UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);
struct
{
// Secure operations require that uvm_push_begin order matches
// uvm_push_end order, because the engine's state is used in its
// internal operation and each push may modify this state.
// push_locks is protected by the channel pool lock.
DECLARE_BITMAP(push_locks, UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);
// Counting semaphore for available and unlocked channels, it must be
// acquired before submitting work to a channel when the Confidential
// Computing feature is enabled.
uvm_semaphore_t push_sem;
// Counting semaphore for available and unlocked channels, it must be
// acquired before submitting work to a channel when the Confidential
// Computing feature is enabled.
uvm_semaphore_t push_sem;
// Per channel buffers in unprotected sysmem.
uvm_rm_mem_t *pool_sysmem;
// Per channel buffers in protected vidmem.
uvm_rm_mem_t *pool_vidmem;
struct
{
// Current encryption key version, incremented upon key rotation.
// While there are separate keys for encryption and decryption, the
// two keys are rotated at once, so the versioning applies to both.
NvU32 version;
// Lock used to ensure mutual exclusion during key rotation.
uvm_mutex_t mutex;
// CSL contexts passed to RM for key rotation. This is usually an
// array containing the CSL contexts associated with the channels in
// the pool. In the case of the WLC pool, the array also includes
// CSL contexts associated with LCIC channels.
UvmCslContext **csl_contexts;
// Number of elements in the CSL context array.
unsigned num_csl_contexts;
// Number of bytes encrypted, or decrypted, on the engine associated
// with the pool since the last key rotation. Only used during
// testing, to force key rotations after a certain encryption size,
// see UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD.
//
// Encryptions on a LCIC pool are accounted for in the paired WLC
// pool.
//
// TODO: Bug 4612912: these accounting variables can be removed once
// RM exposes an API to set the key rotation lower threshold.
atomic64_t encrypted;
atomic64_t decrypted;
} key_rotation;
} conf_computing;
} uvm_channel_pool_t;
struct uvm_channel_struct
@@ -322,43 +366,14 @@ struct uvm_channel_struct
// work launches to match the order of push end-s that triggered them.
volatile NvU32 gpu_put;
// Static pushbuffer for channels with static schedule (WLC/LCIC)
uvm_rm_mem_t *static_pb_protected_vidmem;
// Static pushbuffer staging buffer for WLC
uvm_rm_mem_t *static_pb_unprotected_sysmem;
void *static_pb_unprotected_sysmem_cpu;
void *static_pb_unprotected_sysmem_auth_tag_cpu;
// The above static locations are required by the WLC (and LCIC)
// schedule. Protected sysmem location completes WLC's independence
// from the pushbuffer allocator.
// Protected sysmem location makes WLC independent from the pushbuffer
// allocator. Unprotected sysmem and protected vidmem counterparts
// are allocated from the channel pool (sysmem, vidmem).
void *static_pb_protected_sysmem;
// Static tracking semaphore notifier values
// Because of LCIC's fixed schedule, the secure semaphore release
// mechanism uses two additional static locations for incrementing the
// notifier values. See:
// . channel_semaphore_secure_release()
// . setup_lcic_schedule()
// . internal_channel_submit_work_wlc()
uvm_rm_mem_t *static_notifier_unprotected_sysmem;
NvU32 *static_notifier_entry_unprotected_sysmem_cpu;
NvU32 *static_notifier_exit_unprotected_sysmem_cpu;
uvm_gpu_address_t static_notifier_entry_unprotected_sysmem_gpu_va;
uvm_gpu_address_t static_notifier_exit_unprotected_sysmem_gpu_va;
// Explicit location for push launch tag used by WLC.
// Encryption auth tags have to be located in unprotected sysmem.
void *launch_auth_tag_cpu;
NvU64 launch_auth_tag_gpu_va;
// Used to decrypt the push back to protected sysmem.
// This happens when profilers register callbacks for migration data.
uvm_push_crypto_bundle_t *push_crypto_bundles;
// Accompanying authentication tags for the crypto bundles
uvm_rm_mem_t *push_crypto_bundle_auth_tags;
} conf_computing;
// RM channel information
@@ -418,7 +433,7 @@ struct uvm_channel_manager_struct
unsigned num_channel_pools;
// Mask containing the indexes of the usable Copy Engines. Each usable CE
// has at least one pool associated with it.
// has at least one pool of type UVM_CHANNEL_POOL_TYPE_CE associated with it.
DECLARE_BITMAP(ce_mask, UVM_COPY_ENGINE_COUNT_MAX);
struct
@@ -451,6 +466,16 @@ struct uvm_channel_manager_struct
UVM_BUFFER_LOCATION gpput_loc;
UVM_BUFFER_LOCATION pushbuffer_loc;
} conf;
struct
{
// Flag indicating that the WLC/LCIC mechanism is ready/setup; should
// only be false during (de)initialization.
bool wlc_ready;
// True indicates that key rotation is enabled (UVM-wise).
bool key_rotation_enabled;
} conf_computing;
};
// Create a channel manager for the GPU
@@ -501,6 +526,14 @@ uvm_channel_t *uvm_channel_lcic_get_paired_wlc(uvm_channel_t *lcic_channel);
uvm_channel_t *uvm_channel_wlc_get_paired_lcic(uvm_channel_t *wlc_channel);
NvU64 uvm_channel_get_static_pb_protected_vidmem_gpu_va(uvm_channel_t *channel);
NvU64 uvm_channel_get_static_pb_unprotected_sysmem_gpu_va(uvm_channel_t *channel);
char* uvm_channel_get_static_pb_unprotected_sysmem_cpu(uvm_channel_t *channel);
char *uvm_channel_get_push_crypto_bundle_auth_tags_cpu_va(uvm_channel_t *channel, unsigned tag_index);
static bool uvm_channel_pool_is_proxy(uvm_channel_pool_t *pool)
{
UVM_ASSERT(uvm_pool_type_is_valid(pool->pool_type));
@@ -532,6 +565,17 @@ static uvm_channel_type_t uvm_channel_proxy_channel_type(void)
return UVM_CHANNEL_TYPE_MEMOPS;
}
// Force key rotation in the engine associated with the given channel pool.
// Rotation may still not happen if RM cannot acquire the necessary locks (in
// which case the function returns NV_ERR_STATE_IN_USE).
//
// This function should be only invoked in pools in which key rotation is
// enabled.
NV_STATUS uvm_channel_pool_rotate_key(uvm_channel_pool_t *pool);
// Retrieve the current encryption key version associated with the channel pool.
NvU32 uvm_channel_pool_key_version(uvm_channel_pool_t *pool);
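A minimal caller sketch of the NV_ERR_STATE_IN_USE contract described above; the test helper force_key_rotations added in uvm_channel_test.c by this change follows the same pattern, adding a bounded sleep between attempts:

// Hedged sketch: retry rotation a bounded number of times, treating
// NV_ERR_STATE_IN_USE as "RM could not take its locks yet, try again later".
static NV_STATUS rotate_key_with_retries(uvm_channel_pool_t *pool, unsigned max_tries)
{
    unsigned i;

    for (i = 0; i < max_tries; i++) {
        NV_STATUS status = uvm_channel_pool_rotate_key(pool);

        if (status != NV_ERR_STATE_IN_USE)
            return status; // NV_OK, or a genuine error
    }

    return NV_ERR_STATE_IN_USE;
}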
// Privileged channels support all the Host and engine methods, while
// non-privileged channels don't support privileged methods.
//
@@ -579,12 +623,9 @@ NvU32 uvm_channel_manager_update_progress(uvm_channel_manager_t *channel_manager
// beginning.
NV_STATUS uvm_channel_manager_wait(uvm_channel_manager_t *manager);
// Check if WLC/LCIC mechanism is ready/setup
// Should only return false during initialization
static bool uvm_channel_manager_is_wlc_ready(uvm_channel_manager_t *manager)
{
return (manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] != NULL) &&
(manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] != NULL);
return manager->conf_computing.wlc_ready;
}
// Get the GPU VA of semaphore_channel's tracking semaphore within the VA space
// associated with access_channel.

View File

@@ -796,11 +796,8 @@ done:
NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
{
NV_STATUS status = NV_OK;
uvm_channel_pool_t *pool;
uvm_push_t *pushes;
uvm_gpu_t *gpu;
NvU32 i;
NvU32 num_pushes;
uvm_push_t *pushes = NULL;
uvm_gpu_t *gpu = NULL;
if (!g_uvm_global.conf_computing_enabled)
return NV_OK;
@@ -810,9 +807,19 @@ NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
for_each_va_space_gpu(gpu, va_space) {
uvm_channel_type_t channel_type;
// Key rotation is disabled because this test relies on nested pushes,
// which is illegal. If any push other than the first one triggers key
// rotation, the test won't complete. This is because key rotation
// depends on waiting for ongoing pushes to end, which doesn't happen
// if those pushes are ended after the current one begins.
uvm_conf_computing_disable_key_rotation(gpu);
for (channel_type = 0; channel_type < UVM_CHANNEL_TYPE_COUNT; channel_type++) {
pool = gpu->channel_manager->pool_to_use.default_for_type[channel_type];
TEST_CHECK_RET(pool != NULL);
NvU32 i;
NvU32 num_pushes;
uvm_channel_pool_t *pool = gpu->channel_manager->pool_to_use.default_for_type[channel_type];
TEST_CHECK_GOTO(pool != NULL, error);
// Skip LCIC channels as those can't accept any pushes
if (uvm_channel_pool_is_lcic(pool))
@@ -824,7 +831,7 @@ NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
num_pushes = min(pool->num_channels, (NvU32)UVM_PUSH_MAX_CONCURRENT_PUSHES);
pushes = uvm_kvmalloc_zero(sizeof(*pushes) * num_pushes);
TEST_CHECK_RET(pushes != NULL);
TEST_CHECK_GOTO(pushes != NULL, error);
for (i = 0; i < num_pushes; i++) {
uvm_push_t *push = &pushes[i];
@@ -841,12 +848,18 @@ NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
uvm_kvfree(pushes);
}
uvm_conf_computing_enable_key_rotation(gpu);
}
uvm_thread_context_lock_enable_tracking();
return status;
error:
if (gpu != NULL)
uvm_conf_computing_enable_key_rotation(gpu);
uvm_thread_context_lock_enable_tracking();
uvm_kvfree(pushes);
@@ -948,6 +961,318 @@ release:
return NV_OK;
}
static NV_STATUS force_key_rotations(uvm_channel_pool_t *pool, unsigned num_rotations)
{
unsigned num_tries;
unsigned max_num_tries = 20;
unsigned num_rotations_completed = 0;
if (num_rotations == 0)
return NV_OK;
// The number of accepted rotations is kept low, so failed rotation
// invocations due to RM not acquiring the necessary locks (which imply a
// sleep in the test) do not balloon the test execution time.
UVM_ASSERT(num_rotations <= 10);
for (num_tries = 0; (num_tries < max_num_tries) && (num_rotations_completed < num_rotations); num_tries++) {
// Force key rotation, irrespective of encryption usage.
NV_STATUS status = uvm_channel_pool_rotate_key(pool);
// Key rotation may not be able to complete due to RM failing to acquire
// the necessary locks. Detect the situation, sleep for a bit, and then
// try again
//
// The maximum time spent sleeping in a single rotation call is
// (max_num_tries * max_sleep_us)
if (status == NV_ERR_STATE_IN_USE) {
NvU32 min_sleep_us = 1000;
NvU32 max_sleep_us = 10000;
usleep_range(min_sleep_us, max_sleep_us);
continue;
}
TEST_NV_CHECK_RET(status);
num_rotations_completed++;
}
// If not a single key rotation occurred, the dependent tests still pass,
// but there is not much value to them. Instead, return an error so the
// maximum number of tries, or the maximum sleep time, can be adjusted to
// ensure that at least one rotation completes.
if (num_rotations_completed > 0)
return NV_OK;
else
return NV_ERR_STATE_IN_USE;
}
static NV_STATUS force_key_rotation(uvm_channel_pool_t *pool)
{
return force_key_rotations(pool, 1);
}
// Test key rotation in all pools. This is useful because key rotation may not
// happen otherwise on certain engines during UVM test execution. For example,
// if the MEMOPS channel type is mapped to a CE not shared with any other
// channel type, then the only encryption taking place in the engine is due to
// semaphore releases (4 bytes each). This small encryption size makes it
// unlikely to exceed even small rotation thresholds.
static NV_STATUS test_channel_key_rotation_basic(uvm_gpu_t *gpu)
{
uvm_channel_pool_t *pool;
uvm_for_each_pool(pool, gpu->channel_manager) {
if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
continue;
TEST_NV_CHECK_RET(force_key_rotation(pool));
}
return NV_OK;
}
// Interleave GPU encryptions and decryptions, and their CPU counterparts, with
// key rotations.
static NV_STATUS test_channel_key_rotation_interleave(uvm_gpu_t *gpu)
{
int i;
uvm_channel_pool_t *gpu_to_cpu_pool;
uvm_channel_pool_t *cpu_to_gpu_pool;
NV_STATUS status = NV_OK;
size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
void *initial_plain_cpu = NULL;
void *final_plain_cpu = NULL;
uvm_mem_t *plain_gpu = NULL;
uvm_gpu_address_t plain_gpu_address;
cpu_to_gpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_CPU_TO_GPU];
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(cpu_to_gpu_pool));
gpu_to_cpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(gpu_to_cpu_pool));
initial_plain_cpu = uvm_kvmalloc_zero(size);
if (initial_plain_cpu == NULL) {
status = NV_ERR_NO_MEMORY;
goto out;
}
final_plain_cpu = uvm_kvmalloc_zero(size);
if (final_plain_cpu == NULL) {
status = NV_ERR_NO_MEMORY;
goto out;
}
TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
memset(initial_plain_cpu, 1, size);
for (i = 0; i < 5; i++) {
TEST_NV_CHECK_GOTO(force_key_rotation(gpu_to_cpu_pool), out);
TEST_NV_CHECK_GOTO(force_key_rotation(cpu_to_gpu_pool), out);
TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
plain_gpu_address,
initial_plain_cpu,
size,
NULL,
"CPU > GPU"),
out);
TEST_NV_CHECK_GOTO(force_key_rotation(gpu_to_cpu_pool), out);
TEST_NV_CHECK_GOTO(force_key_rotation(cpu_to_gpu_pool), out);
TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_gpu_to_cpu(gpu,
final_plain_cpu,
plain_gpu_address,
size,
NULL,
"GPU > CPU"),
out);
TEST_CHECK_GOTO(!memcmp(initial_plain_cpu, final_plain_cpu, size), out);
memset(final_plain_cpu, 0, size);
}
out:
uvm_mem_free(plain_gpu);
uvm_kvfree(final_plain_cpu);
uvm_kvfree(initial_plain_cpu);
return status;
}
static NV_STATUS memset_vidmem(uvm_mem_t *mem, NvU8 val)
{
uvm_push_t push;
uvm_gpu_address_t gpu_address;
uvm_gpu_t *gpu = mem->backing_gpu;
UVM_ASSERT(uvm_mem_is_vidmem(mem));
TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "zero vidmem"));
gpu_address = uvm_mem_gpu_address_virtual_kernel(mem, gpu);
gpu->parent->ce_hal->memset_1(&push, gpu_address, val, mem->size);
TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));
return NV_OK;
}
// Custom version of uvm_conf_computing_util_memcopy_gpu_to_cpu that allows
// testing to insert key rotations in between the push end, and the CPU
// decryption
static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
void *dst_plain,
uvm_gpu_address_t src_gpu_address,
size_t size,
unsigned num_rotations_to_insert)
{
NV_STATUS status;
uvm_push_t push;
uvm_conf_computing_dma_buffer_t *dma_buffer;
uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
void *src_cipher, *auth_tag;
uvm_channel_t *channel;
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
if (status != NV_OK)
return status;
status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "Small GPU > CPU encryption");
if (status != NV_OK)
goto out;
channel = push.channel;
uvm_conf_computing_log_gpu_encryption(channel, size, dma_buffer->decrypt_iv);
dma_buffer->key_version[0] = uvm_channel_pool_key_version(channel->pool);
dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
status = uvm_push_end_and_wait(&push);
if (status != NV_OK)
goto out;
TEST_NV_CHECK_GOTO(force_key_rotations(channel->pool, num_rotations_to_insert), out);
// If num_rotations_to_insert is not zero, the current encryption key will
// be different from the one used during CE encryption.
src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
status = uvm_conf_computing_cpu_decrypt(channel,
dst_plain,
src_cipher,
dma_buffer->decrypt_iv,
dma_buffer->key_version[0],
size,
auth_tag);
out:
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
return status;
}
static NV_STATUS test_channel_key_rotation_cpu_decryption(uvm_gpu_t *gpu,
unsigned num_repetitions,
unsigned num_rotations_to_insert)
{
unsigned i;
uvm_channel_pool_t *gpu_to_cpu_pool;
NV_STATUS status = NV_OK;
size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
NvU8 *plain_cpu = NULL;
uvm_mem_t *plain_gpu = NULL;
uvm_gpu_address_t plain_gpu_address;
if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
return NV_OK;
gpu_to_cpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(gpu_to_cpu_pool));
plain_cpu = (NvU8 *) uvm_kvmalloc_zero(size);
if (plain_cpu == NULL) {
status = NV_ERR_NO_MEMORY;
goto out;
}
TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
TEST_NV_CHECK_GOTO(memset_vidmem(plain_gpu, 1), out);
plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
for (i = 0; i < num_repetitions; i++) {
unsigned j;
TEST_NV_CHECK_GOTO(encrypted_memcopy_gpu_to_cpu(gpu,
plain_cpu,
plain_gpu_address,
size,
num_rotations_to_insert),
out);
for (j = 0; j < size; j++)
TEST_CHECK_GOTO(plain_cpu[j] == 1, out);
memset(plain_cpu, 0, size);
}
out:
uvm_mem_free(plain_gpu);
uvm_kvfree(plain_cpu);
return status;
}
// Test that CPU decryptions can use old keys, i.e., previous versions of the
// keys that are no longer current due to key rotation. Given that SEC2 does
// not expose encryption capabilities, the "decrypt-after-rotation" problem is
// exclusive to CE encryptions.
static NV_STATUS test_channel_key_rotation_decrypt_after_key_rotation(uvm_gpu_t *gpu)
{
// Instruct encrypted_memcopy_gpu_to_cpu to insert several key rotations
// between the GPU encryption, and the associated CPU decryption.
unsigned num_rotations_to_insert = 8;
TEST_NV_CHECK_RET(test_channel_key_rotation_cpu_decryption(gpu, 1, num_rotations_to_insert));
return NV_OK;
}
static NV_STATUS test_channel_key_rotation(uvm_va_space_t *va_space)
{
uvm_gpu_t *gpu;
if (!g_uvm_global.conf_computing_enabled)
return NV_OK;
for_each_va_space_gpu(gpu, va_space) {
if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
break;
TEST_NV_CHECK_RET(test_channel_key_rotation_basic(gpu));
TEST_NV_CHECK_RET(test_channel_key_rotation_interleave(gpu));
TEST_NV_CHECK_RET(test_channel_key_rotation_decrypt_after_key_rotation(gpu));
}
return NV_OK;
}
NV_STATUS test_write_ctrl_gpfifo_noop(uvm_va_space_t *va_space)
{
uvm_gpu_t *gpu;
@@ -1203,6 +1528,10 @@ NV_STATUS uvm_test_channel_sanity(UVM_TEST_CHANNEL_SANITY_PARAMS *params, struct
if (status != NV_OK)
goto done;
status = test_channel_key_rotation(va_space);
if (status != NV_OK)
goto done;
// The following tests have side effects, they reset the GPU's
// channel_manager.
status = test_channel_pushbuffer_extension_base(va_space);
@@ -1338,6 +1667,126 @@ done:
return status;
}
static NV_STATUS channel_stress_key_rotation_cpu_encryption(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
{
int i;
uvm_channel_pool_t *cpu_to_gpu_pool;
NV_STATUS status = NV_OK;
size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
void *initial_plain_cpu = NULL;
uvm_mem_t *plain_gpu = NULL;
uvm_gpu_address_t plain_gpu_address;
UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU);
cpu_to_gpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_CPU_TO_GPU];
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(cpu_to_gpu_pool));
initial_plain_cpu = uvm_kvmalloc_zero(size);
if (initial_plain_cpu == NULL) {
status = NV_ERR_NO_MEMORY;
goto out;
}
TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
memset(initial_plain_cpu, 1, size);
for (i = 0; i < params->iterations; i++) {
TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
plain_gpu_address,
initial_plain_cpu,
size,
NULL,
"CPU > GPU"),
out);
}
out:
uvm_mem_free(plain_gpu);
uvm_kvfree(initial_plain_cpu);
return status;
}
static NV_STATUS channel_stress_key_rotation_cpu_decryption(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
{
unsigned num_rotations_to_insert = 0;
UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU);
return test_channel_key_rotation_cpu_decryption(gpu, params->iterations, num_rotations_to_insert);
}
static NV_STATUS channel_stress_key_rotation_rotate(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
{
NvU32 i;
UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE);
for (i = 0; i < params->iterations; ++i) {
NV_STATUS status;
uvm_channel_pool_t *pool;
uvm_channel_type_t type;
if ((i % 3) == 0)
type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
else if ((i % 3) == 1)
type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
else
type = UVM_CHANNEL_TYPE_WLC;
pool = gpu->channel_manager->pool_to_use.default_for_type[type];
if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
return NV_ERR_INVALID_STATE;
status = force_key_rotation(pool);
if (status != NV_OK)
return status;
}
return NV_OK;
}
// The objective of this test is documented in the user-level function
static NV_STATUS uvm_test_channel_stress_key_rotation(uvm_va_space_t *va_space, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
{
uvm_test_rng_t rng;
uvm_gpu_t *gpu;
NV_STATUS status = NV_OK;
if (!g_uvm_global.conf_computing_enabled)
return NV_OK;
uvm_test_rng_init(&rng, params->seed);
uvm_va_space_down_read(va_space);
// Key rotation should be enabled, or disabled, in all GPUs. Pick a random
// one.
gpu = random_va_space_gpu(&rng, va_space);
if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
goto out;
if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU)
status = channel_stress_key_rotation_cpu_encryption(gpu, params);
else if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU)
status = channel_stress_key_rotation_cpu_decryption(gpu, params);
else if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE)
status = channel_stress_key_rotation_rotate(gpu, params);
else
status = NV_ERR_INVALID_PARAMETER;
out:
uvm_va_space_up_read(va_space);
return status;
}
NV_STATUS uvm_test_channel_stress(UVM_TEST_CHANNEL_STRESS_PARAMS *params, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
@@ -1349,6 +1798,8 @@ NV_STATUS uvm_test_channel_stress(UVM_TEST_CHANNEL_STRESS_PARAMS *params, struct
return uvm_test_channel_stress_update_channels(va_space, params);
case UVM_TEST_CHANNEL_STRESS_MODE_NOOP_PUSH:
return uvm_test_channel_noop_push(va_space, params);
case UVM_TEST_CHANNEL_STRESS_MODE_KEY_ROTATION:
return uvm_test_channel_stress_key_rotation(va_space, params);
default:
return NV_ERR_INVALID_PARAMETER;
}

View File

@@ -33,6 +33,15 @@
#include "nv_uvm_interface.h"
#include "uvm_va_block.h"
// Amount of encrypted data on a given engine that triggers key rotation. This
// is a UVM internal threshold, different from that of RM, and used only during
// testing.
//
// Key rotation is triggered when the total encryption size, or the total
// decryption size (whichever comes first) reaches this lower threshold on the
// engine.
#define UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD (UVM_SIZE_1MB * 8)
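A sketch of the test-only check this threshold enables, using the per-pool accounting counters added to uvm_channel_pool_t in this change; the in-tree helper conf_computing_is_key_rotation_pending_use_stats further down in this file is the authoritative version:

// Hedged sketch: rotation is considered pending once the accumulated size in
// either direction crosses the lower threshold.
static bool key_rotation_pending_by_size(uvm_channel_pool_t *pool)
{
    return (atomic64_read(&pool->conf_computing.key_rotation.encrypted) >
            UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD) ||
           (atomic64_read(&pool->conf_computing.key_rotation.decrypted) >
            UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD);
}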
// The maximum number of secure operations per push is:
// UVM_MAX_PUSH_SIZE / min(CE encryption size, CE decryption size)
// + 1 (tracking semaphore) = 128 * 1024 / 56 + 1 = 2342
@@ -352,6 +361,19 @@ error:
return status;
}
// The production key rotation defaults are such that key rotations rarely
// happen. During UVM testing, more frequent rotations are triggered by relying
// on internal encryption usage accounting. When key rotations are triggered by
// UVM, the driver does not rely on channel key rotation notifiers.
//
// TODO: Bug 4612912: UVM should be able to programmatically set the rotation
// lower threshold. This function, and all the metadata associated with it
// (per-pool encryption accounting, for example) can be removed at that point.
static bool key_rotation_is_notifier_driven(void)
{
return !uvm_enable_builtin_tests;
}
NV_STATUS uvm_conf_computing_gpu_init(uvm_gpu_t *gpu)
{
NV_STATUS status;
@@ -394,17 +416,35 @@ void uvm_conf_computing_gpu_deinit(uvm_gpu_t *gpu)
conf_computing_dma_buffer_pool_deinit(&gpu->conf_computing.dma_buffer_pool);
}
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv)
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, size_t size, UvmCslIv *iv)
{
NV_STATUS status;
uvm_channel_pool_t *pool;
if (uvm_channel_is_lcic(channel))
pool = uvm_channel_lcic_get_paired_wlc(channel)->pool;
else
pool = channel->pool;
uvm_mutex_lock(&channel->csl.ctx_lock);
if (uvm_conf_computing_is_key_rotation_enabled_in_pool(pool)) {
status = nvUvmInterfaceCslLogEncryption(&channel->csl.ctx, UVM_CSL_OPERATION_DECRYPT, size);
// Informing RM of an encryption/decryption should not fail
UVM_ASSERT(status == NV_OK);
if (!key_rotation_is_notifier_driven())
atomic64_add(size, &pool->conf_computing.key_rotation.encrypted);
}
status = nvUvmInterfaceCslIncrementIv(&channel->csl.ctx, UVM_CSL_OPERATION_DECRYPT, 1, iv);
uvm_mutex_unlock(&channel->csl.ctx_lock);
// IV rotation is done preemptively as needed, so the above
// call cannot return failure.
UVM_ASSERT(status == NV_OK);
uvm_mutex_unlock(&channel->csl.ctx_lock);
}
void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *iv)
@@ -428,27 +468,46 @@ void uvm_conf_computing_cpu_encrypt(uvm_channel_t *channel,
void *auth_tag_buffer)
{
NV_STATUS status;
uvm_channel_pool_t *pool;
UVM_ASSERT(size);
if (uvm_channel_is_lcic(channel))
pool = uvm_channel_lcic_get_paired_wlc(channel)->pool;
else
pool = channel->pool;
uvm_mutex_lock(&channel->csl.ctx_lock);
status = nvUvmInterfaceCslEncrypt(&channel->csl.ctx,
size,
(NvU8 const *) src_plain,
encrypt_iv,
(NvU8 *) dst_cipher,
(NvU8 *) auth_tag_buffer);
uvm_mutex_unlock(&channel->csl.ctx_lock);
// IV rotation is done preemptively as needed, so the above
// call cannot return failure.
UVM_ASSERT(status == NV_OK);
if (uvm_conf_computing_is_key_rotation_enabled_in_pool(pool)) {
status = nvUvmInterfaceCslLogEncryption(&channel->csl.ctx, UVM_CSL_OPERATION_ENCRYPT, size);
// Informing RM of an encryption/decryption should not fail
UVM_ASSERT(status == NV_OK);
if (!key_rotation_is_notifier_driven())
atomic64_add(size, &pool->conf_computing.key_rotation.decrypted);
}
uvm_mutex_unlock(&channel->csl.ctx_lock);
}
NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
void *dst_plain,
const void *src_cipher,
const UvmCslIv *src_iv,
NvU32 key_version,
size_t size,
const void *auth_tag_buffer)
{
@@ -469,10 +528,19 @@ NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
size,
(const NvU8 *) src_cipher,
src_iv,
key_version,
(NvU8 *) dst_plain,
NULL,
0,
(const NvU8 *) auth_tag_buffer);
if (status != NV_OK) {
UVM_ERR_PRINT("nvUvmInterfaceCslDecrypt() failed: %s, channel %s, GPU %s\n",
nvstatusToString(status),
channel->name,
uvm_gpu_name(uvm_channel_get_gpu(channel)));
}
uvm_mutex_unlock(&channel->csl.ctx_lock);
return status;
@@ -485,6 +553,8 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
NvU8 valid)
{
NV_STATUS status;
NvU32 fault_entry_size = parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
UvmCslContext *csl_context = &parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx;
// There is no dedicated lock for the CSL context associated with replayable
// faults. The mutual exclusion required by the RM CSL API is enforced by
@@ -494,36 +564,48 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
status = nvUvmInterfaceCslDecrypt(&parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx,
parent_gpu->fault_buffer_hal->entry_size(parent_gpu),
status = nvUvmInterfaceCslLogEncryption(csl_context, UVM_CSL_OPERATION_DECRYPT, fault_entry_size);
// Informing RM of an encryption/decryption should not fail
UVM_ASSERT(status == NV_OK);
status = nvUvmInterfaceCslDecrypt(csl_context,
fault_entry_size,
(const NvU8 *) src_cipher,
NULL,
NV_U32_MAX,
(NvU8 *) dst_plain,
&valid,
sizeof(valid),
(const NvU8 *) auth_tag_buffer);
if (status != NV_OK)
if (status != NV_OK) {
UVM_ERR_PRINT("nvUvmInterfaceCslDecrypt() failed: %s, GPU %s\n",
nvstatusToString(status),
uvm_parent_gpu_name(parent_gpu));
}
return status;
}
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu, NvU64 increment)
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu)
{
NV_STATUS status;
NvU32 fault_entry_size = parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
UvmCslContext *csl_context = &parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx;
// See comment in uvm_conf_computing_fault_decrypt
UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock));
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
status = nvUvmInterfaceCslIncrementIv(&parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx,
UVM_CSL_OPERATION_DECRYPT,
increment,
NULL);
status = nvUvmInterfaceCslLogEncryption(csl_context, UVM_CSL_OPERATION_DECRYPT, fault_entry_size);
// Informing RM of an encryption/decryption should not fail
UVM_ASSERT(status == NV_OK);
status = nvUvmInterfaceCslIncrementIv(csl_context, UVM_CSL_OPERATION_DECRYPT, 1, NULL);
UVM_ASSERT(status == NV_OK);
}
@@ -625,3 +707,231 @@ NV_STATUS uvm_conf_computing_maybe_rotate_channel_ivs_retry_busy(uvm_channel_t *
{
return uvm_conf_computing_rotate_channel_ivs_below_limit(channel, uvm_conf_computing_channel_iv_rotation_limit, true);
}
void uvm_conf_computing_enable_key_rotation(uvm_gpu_t *gpu)
{
if (!g_uvm_global.conf_computing_enabled)
return;
// Key rotation cannot be enabled on UVM if it is disabled on RM
if (!gpu->parent->rm_info.gpuConfComputeCaps.bKeyRotationEnabled)
return;
gpu->channel_manager->conf_computing.key_rotation_enabled = true;
}
void uvm_conf_computing_disable_key_rotation(uvm_gpu_t *gpu)
{
if (!g_uvm_global.conf_computing_enabled)
return;
gpu->channel_manager->conf_computing.key_rotation_enabled = false;
}
bool uvm_conf_computing_is_key_rotation_enabled(uvm_gpu_t *gpu)
{
return gpu->channel_manager->conf_computing.key_rotation_enabled;
}
bool uvm_conf_computing_is_key_rotation_enabled_in_pool(uvm_channel_pool_t *pool)
{
if (!uvm_conf_computing_is_key_rotation_enabled(pool->manager->gpu))
return false;
// TODO: Bug 4586447: key rotation must be disabled in the SEC2 engine,
// because currently the encryption key is shared between UVM and RM, but
// UVM is not able to idle SEC2 channels owned by RM.
if (uvm_channel_pool_is_sec2(pool))
return false;
// Key rotation happens as part of channel reservation, and LCIC channels
// are never reserved directly. Rotation of keys in LCIC channels happens
// as the result of key rotation in WLC channels.
//
// Return false even if there is nothing fundamental prohibiting direct key
// rotation on LCIC pools
if (uvm_channel_pool_is_lcic(pool))
return false;
return true;
}
static bool conf_computing_is_key_rotation_pending_use_stats(uvm_channel_pool_t *pool)
{
NvU64 decrypted, encrypted;
UVM_ASSERT(!key_rotation_is_notifier_driven());
decrypted = atomic64_read(&pool->conf_computing.key_rotation.decrypted);
if (decrypted > UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD)
return true;
encrypted = atomic64_read(&pool->conf_computing.key_rotation.encrypted);
if (encrypted > UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD)
return true;
return false;
}
static bool conf_computing_is_key_rotation_pending_use_notifier(uvm_channel_pool_t *pool)
{
// If key rotation is pending for the pool's engine, then the key rotation
// notifier in any of the engine channels can be used by UVM to detect the
// situation. Note that RM doesn't update all the notifiers in a single
// atomic operation, so it is possible that the channel read by UVM (the
// first one in the pool) indicates that a key rotation is pending, but
// another channel in the pool (temporarily) indicates the opposite, or vice
// versa.
uvm_channel_t *first_channel = pool->channels;
UVM_ASSERT(key_rotation_is_notifier_driven());
UVM_ASSERT(first_channel != NULL);
return first_channel->channel_info.keyRotationNotifier->status == UVM_KEY_ROTATION_STATUS_PENDING;
}
bool uvm_conf_computing_is_key_rotation_pending_in_pool(uvm_channel_pool_t *pool)
{
if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
return false;
if (key_rotation_is_notifier_driven())
return conf_computing_is_key_rotation_pending_use_notifier(pool);
else
return conf_computing_is_key_rotation_pending_use_stats(pool);
}
NV_STATUS uvm_conf_computing_rotate_pool_key(uvm_channel_pool_t *pool)
{
NV_STATUS status;
UVM_ASSERT(uvm_conf_computing_is_key_rotation_enabled_in_pool(pool));
UVM_ASSERT(pool->conf_computing.key_rotation.csl_contexts != NULL);
UVM_ASSERT(pool->conf_computing.key_rotation.num_csl_contexts > 0);
// NV_ERR_STATE_IN_USE indicates that RM was not able to acquire the
// required locks at this time. This status is not interpreted as an error,
// but as a sign for UVM to try again later. This is the same "protocol"
// used in IV rotation.
status = nvUvmInterfaceCslRotateKey(pool->conf_computing.key_rotation.csl_contexts,
pool->conf_computing.key_rotation.num_csl_contexts);
if (status == NV_OK) {
pool->conf_computing.key_rotation.version++;
if (!key_rotation_is_notifier_driven()) {
atomic64_set(&pool->conf_computing.key_rotation.decrypted, 0);
atomic64_set(&pool->conf_computing.key_rotation.encrypted, 0);
}
}
else if (status != NV_ERR_STATE_IN_USE) {
UVM_DBG_PRINT("nvUvmInterfaceCslRotateKey() failed in engine %u: %s\n",
pool->engine_index,
nvstatusToString(status));
}
return status;
}
__attribute__ ((format(printf, 6, 7)))
NV_STATUS uvm_conf_computing_util_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
uvm_gpu_address_t dst_gpu_address,
void *src_plain,
size_t size,
uvm_tracker_t *tracker,
const char *format,
...)
{
NV_STATUS status;
uvm_push_t push;
uvm_conf_computing_dma_buffer_t *dma_buffer;
uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address;
void *dst_cipher, *auth_tag;
va_list args;
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
if (status != NV_OK)
return status;
va_start(args, format);
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args);
va_end(args);
if (status != NV_OK)
goto out;
dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag);
src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
status = uvm_push_end_and_wait(&push);
out:
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
return status;
}
__attribute__ ((format(printf, 6, 7)))
NV_STATUS uvm_conf_computing_util_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
void *dst_plain,
uvm_gpu_address_t src_gpu_address,
size_t size,
uvm_tracker_t *tracker,
const char *format,
...)
{
NV_STATUS status;
uvm_push_t push;
uvm_conf_computing_dma_buffer_t *dma_buffer;
uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
void *src_cipher, *auth_tag;
va_list args;
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
if (status != NV_OK)
return status;
va_start(args, format);
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args);
va_end(args);
if (status != NV_OK)
goto out;
uvm_conf_computing_log_gpu_encryption(push.channel, size, dma_buffer->decrypt_iv);
dma_buffer->key_version[0] = uvm_channel_pool_key_version(push.channel->pool);
dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
status = uvm_push_end_and_wait(&push);
if (status != NV_OK)
goto out;
src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
status = uvm_conf_computing_cpu_decrypt(push.channel,
dst_plain,
src_cipher,
dma_buffer->decrypt_iv,
dma_buffer->key_version[0],
size,
auth_tag);
out:
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
return status;
}

View File

@@ -87,9 +87,9 @@ typedef struct
// a free buffer.
uvm_tracker_t tracker;
// When the DMA buffer is used as the destination of a GPU encryption, SEC2
// writes the authentication tag here. Later when the buffer is decrypted
// on the CPU the authentication tag is used again (read) for CSL to verify
// When the DMA buffer is used as the destination of a GPU encryption, the
// engine (CE or SEC2) writes the authentication tag here. When the buffer
// is decrypted on the CPU the authentication tag is used by CSL to verify
// the authenticity. The allocation is big enough for one authentication
// tag per PAGE_SIZE page in the alloc buffer.
uvm_mem_t *auth_tag;
@@ -98,7 +98,12 @@ typedef struct
// to the authentication tag. The allocation is big enough for one IV per
// PAGE_SIZE page in the alloc buffer. The granularity between the decrypt
// IV and authentication tag must match.
UvmCslIv decrypt_iv[(UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE)];
UvmCslIv decrypt_iv[UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE];
// When the DMA buffer is used as the destination of a GPU encryption, the
// key version used during GPU encryption of each PAGE_SIZE page can be
// saved here, so CPU decryption uses the correct decryption key.
NvU32 key_version[UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE];
// Bitmap of the encrypted pages in the backing allocation
uvm_page_mask_t encrypted_page_mask;
@@ -147,7 +152,7 @@ NV_STATUS uvm_conf_computing_gpu_init(uvm_gpu_t *gpu);
void uvm_conf_computing_gpu_deinit(uvm_gpu_t *gpu);
// Logs encryption information from the GPU and returns the IV.
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv);
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, size_t size, UvmCslIv *iv);
// Acquires next CPU encryption IV and returns it.
void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *iv);
@@ -167,10 +172,14 @@ void uvm_conf_computing_cpu_encrypt(uvm_channel_t *channel,
// CPU side decryption helper. Decrypts data from src_cipher and writes the
// plain text in dst_plain. src_cipher and dst_plain can't overlap. IV obtained
// from uvm_conf_computing_log_gpu_encryption() needs to be passed to src_iv.
//
// The caller must indicate which key to use for decryption by passing the
// appropriate key version number.
NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
void *dst_plain,
const void *src_cipher,
const UvmCslIv *src_iv,
NvU32 key_version,
size_t size,
const void *auth_tag_buffer);
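The intended pairing, as used by uvm_conf_computing_util_memcopy_gpu_to_cpu in this change, is roughly the following sketch (dma_buffer is a uvm_conf_computing_dma_buffer_t; error handling elided):

// Hedged sketch: capture the IV and key version at GPU-encryption time, then
// pass both to the CPU decryption, which stays correct even if the pool key
// rotates in between.
uvm_conf_computing_log_gpu_encryption(push.channel, size, dma_buffer->decrypt_iv);
dma_buffer->key_version[0] = uvm_channel_pool_key_version(push.channel->pool);

// ... CE encrypts into the DMA buffer, the push is ended and waited on ...

status = uvm_conf_computing_cpu_decrypt(push.channel,
                                        dst_plain,
                                        src_cipher,
                                        dma_buffer->decrypt_iv,
                                        dma_buffer->key_version[0],
                                        size,
                                        auth_tag);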
@@ -191,12 +200,12 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
NvU8 valid);
// Increment the CPU-side decrypt IV of the CSL context associated with
// replayable faults. The function is a no-op if the given increment is zero.
// replayable faults.
//
// The IV associated with a fault CSL context is a 64-bit counter.
//
// Locking: this function must be invoked while holding the replayable ISR lock.
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu, NvU64 increment);
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu);
// Query the number of remaining messages before IV needs to be rotated.
void uvm_conf_computing_query_message_pools(uvm_channel_t *channel,
@@ -214,4 +223,71 @@ NV_STATUS uvm_conf_computing_maybe_rotate_channel_ivs_retry_busy(uvm_channel_t *
// Check if there are fewer than 'limit' messages available in either direction
// and rotate if not.
NV_STATUS uvm_conf_computing_rotate_channel_ivs_below_limit(uvm_channel_t *channel, NvU64 limit, bool retry_if_busy);
// Rotate the engine key associated with the given channel pool.
NV_STATUS uvm_conf_computing_rotate_pool_key(uvm_channel_pool_t *pool);
// Returns true if key rotation is allowed in the channel pool.
bool uvm_conf_computing_is_key_rotation_enabled_in_pool(uvm_channel_pool_t *pool);
// Returns true if key rotation is pending in the channel pool.
bool uvm_conf_computing_is_key_rotation_pending_in_pool(uvm_channel_pool_t *pool);
// Enable/disable key rotation in the passed GPU. Note that UVM enablement is
// dependent on RM enablement: key rotation may still be disabled upon calling
// this function, if it is disabled in RM. On the other hand, key rotation can
// be disabled in UVM, even if it is enabled in RM.
//
// Enablement/Disablement affects only kernel key rotation in keys owned by UVM.
// It doesn't affect user key rotation (CUDA, Video...), nor does it affect RM
// kernel key rotation.
void uvm_conf_computing_enable_key_rotation(uvm_gpu_t *gpu);
void uvm_conf_computing_disable_key_rotation(uvm_gpu_t *gpu);
// Returns true if key rotation is enabled on UVM in the given GPU. Key rotation
// can be enabled on the GPU but disabled on some GPU engines (LCEs or SEC2),
// see uvm_conf_computing_is_key_rotation_enabled_in_pool.
bool uvm_conf_computing_is_key_rotation_enabled(uvm_gpu_t *gpu);
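For example, a sketch in the spirit of test_conf_computing_channel_selection from this change, which cannot tolerate a rotation while it holds nested pushes (run_rotation_sensitive_work is a hypothetical placeholder):

// Hedged sketch: temporarily disable UVM-driven key rotation around work that
// cannot tolerate a rotation, then re-enable it.
uvm_conf_computing_disable_key_rotation(gpu);

status = run_rotation_sensitive_work(gpu); // hypothetical helper

uvm_conf_computing_enable_key_rotation(gpu);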
// Launch a synchronous, encrypted copy between CPU and GPU.
//
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
//
// The source CPU buffer pointed by src_plain contains the unencrypted (plain
// text) contents; the function internally performs a CPU-side encryption step
// before launching the GPU-side CE decryption. The source buffer can be in
// protected or unprotected sysmem, while the destination buffer must be in
// protected vidmem.
//
// The input tracker, if not NULL, is internally acquired by the push
// responsible for the encrypted copy.
__attribute__ ((format(printf, 6, 7)))
NV_STATUS uvm_conf_computing_util_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
uvm_gpu_address_t dst_gpu_address,
void *src_plain,
size_t size,
uvm_tracker_t *tracker,
const char *format,
...);
// Launch a synchronous, encrypted copy between GPU and CPU.
//
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
//
// The source GPU buffer pointed by src_gpu_address must be in protected
// vidmem; the function internally performs a GPU-side CE encryption step
// followed by a CPU-side decryption, and writes the resulting plain text to
// the CPU buffer pointed by dst_plain.
//
// The input tracker, if not NULL, is internally acquired by the push
// responsible for the encrypted copy.
__attribute__ ((format(printf, 6, 7)))
NV_STATUS uvm_conf_computing_util_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
void *dst_plain,
uvm_gpu_address_t src_gpu_address,
size_t size,
uvm_tracker_t *tracker,
const char *format,
...);
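A round-trip usage sketch following test_channel_key_rotation_interleave in this change; plain_gpu_address is a kernel-mapped protected vidmem address, src_cpu and dst_cpu are size-byte CPU buffers, and the NULL tracker means the pushes have no dependencies (error handling elided):

// Hedged sketch: encrypted CPU -> GPU copy, then the reverse copy, then a
// plain-text comparison of the two CPU buffers.
status = uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
                                                    plain_gpu_address,
                                                    src_cpu,
                                                    size,
                                                    NULL,
                                                    "CPU > GPU");

status = uvm_conf_computing_util_memcopy_gpu_to_cpu(gpu,
                                                    dst_cpu,
                                                    plain_gpu_address,
                                                    size,
                                                    NULL,
                                                    "GPU > CPU");

UVM_ASSERT(memcmp(src_cpu, dst_cpu, size) == 0);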
#endif // __UVM_CONF_COMPUTING_H__

View File

@@ -591,7 +591,7 @@ static void fault_buffer_skip_replayable_entry(uvm_parent_gpu_t *parent_gpu, NvU
// replayable faults still requires manual adjustment so it is kept in sync
// with the encryption IV on the GSP-RM's side.
if (g_uvm_global.conf_computing_enabled)
uvm_conf_computing_fault_increment_decrypt_iv(parent_gpu, 1);
uvm_conf_computing_fault_increment_decrypt_iv(parent_gpu);
parent_gpu->fault_buffer_hal->entry_clear_valid(parent_gpu, index);
}

View File

@@ -60,6 +60,17 @@ struct uvm_gpu_semaphore_pool_page_struct
// Allocation backing the page
uvm_rm_mem_t *memory;
struct {
// Unprotected sysmem storing encrypted value of semaphores
uvm_rm_mem_t *encrypted_payload_memory;
// Unprotected sysmem storing encryption auth tags
uvm_rm_mem_t *auth_tag_memory;
// Unprotected sysmem storing plain text notifier values
uvm_rm_mem_t *notifier_memory;
} conf_computing;
// Pool the page is part of
uvm_gpu_semaphore_pool_t *pool;
@@ -80,26 +91,6 @@ static bool gpu_semaphore_is_secure(uvm_gpu_semaphore_t *semaphore)
return gpu_semaphore_pool_is_secure(semaphore->page->pool);
}
static NvU32 get_index(uvm_gpu_semaphore_t *semaphore)
{
NvU32 offset;
NvU32 index;
if (gpu_semaphore_is_secure(semaphore))
return semaphore->conf_computing.index;
UVM_ASSERT(semaphore->payload != NULL);
UVM_ASSERT(semaphore->page != NULL);
offset = (char*)semaphore->payload - (char*)uvm_rm_mem_get_cpu_va(semaphore->page->memory);
UVM_ASSERT(offset % UVM_SEMAPHORE_SIZE == 0);
index = offset / UVM_SEMAPHORE_SIZE;
UVM_ASSERT(index < UVM_SEMAPHORE_COUNT_PER_PAGE);
return index;
}
// Use canary values on debug builds to catch semaphore use-after-free. We can
// catch release-after-free by simply setting the payload to a known value at
// free then checking it on alloc or pool free, but catching acquire-after-free
@@ -150,34 +141,83 @@ static bool gpu_can_access_semaphore_pool(uvm_gpu_t *gpu, uvm_rm_mem_t *rm_mem)
return ((uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu) + rm_mem->size - 1) < gpu->parent->max_host_va);
}
// Secure semaphore pools are allocated in the CPR of vidmem and only mapped to
// the owning GPU, as no other processor has access to it.
static NV_STATUS pool_alloc_secure_page(uvm_gpu_semaphore_pool_t *pool,
uvm_gpu_semaphore_pool_page_t *pool_page,
uvm_rm_mem_type_t memory_type)
static void pool_page_free_buffers(uvm_gpu_semaphore_pool_page_t *page)
{
uvm_rm_mem_free(page->memory);
page->memory = NULL;
if (gpu_semaphore_pool_is_secure(page->pool)) {
uvm_rm_mem_free(page->conf_computing.encrypted_payload_memory);
uvm_rm_mem_free(page->conf_computing.auth_tag_memory);
uvm_rm_mem_free(page->conf_computing.notifier_memory);
page->conf_computing.encrypted_payload_memory = NULL;
page->conf_computing.auth_tag_memory = NULL;
page->conf_computing.notifier_memory = NULL;
}
else {
UVM_ASSERT(!page->conf_computing.encrypted_payload_memory);
UVM_ASSERT(!page->conf_computing.auth_tag_memory);
UVM_ASSERT(!page->conf_computing.notifier_memory);
}
}
static NV_STATUS pool_page_alloc_buffers(uvm_gpu_semaphore_pool_page_t *page)
{
NV_STATUS status;
uvm_gpu_semaphore_pool_t *pool = page->pool;
uvm_rm_mem_type_t memory_type = (pool->aperture == UVM_APERTURE_SYS) ? UVM_RM_MEM_TYPE_SYS : UVM_RM_MEM_TYPE_GPU;
size_t align = 0;
bool map_all = true;
align = gpu_semaphore_pool_is_secure(pool) ? UVM_CONF_COMPUTING_BUF_ALIGNMENT : 0;
map_all = gpu_semaphore_pool_is_secure(pool) ? false : true;
UVM_ASSERT(gpu_semaphore_pool_is_secure(pool));
status = uvm_rm_mem_alloc(pool->gpu,
memory_type,
UVM_SEMAPHORE_PAGE_SIZE,
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
&pool_page->memory);
if (map_all)
status = uvm_rm_mem_alloc_and_map_all(pool->gpu, memory_type, UVM_SEMAPHORE_PAGE_SIZE, align, &page->memory);
else
status = uvm_rm_mem_alloc(pool->gpu, memory_type, UVM_SEMAPHORE_PAGE_SIZE, align, &page->memory);
if (status != NV_OK)
return status;
goto error;
if (!gpu_semaphore_pool_is_secure(pool))
return NV_OK;
status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
UVM_RM_MEM_TYPE_SYS,
UVM_SEMAPHORE_PAGE_SIZE,
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
&page->conf_computing.encrypted_payload_memory);
if (status != NV_OK)
goto error;
BUILD_BUG_ON(UVM_CONF_COMPUTING_AUTH_TAG_SIZE % UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT);
status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
UVM_RM_MEM_TYPE_SYS,
UVM_SEMAPHORE_COUNT_PER_PAGE * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
&page->conf_computing.auth_tag_memory);
if (status != NV_OK)
goto error;
status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
UVM_RM_MEM_TYPE_SYS,
UVM_SEMAPHORE_COUNT_PER_PAGE * sizeof(NvU32),
0,
&page->conf_computing.notifier_memory);
if (status != NV_OK)
goto error;
return NV_OK;
error:
pool_page_free_buffers(page);
return status;
}
static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
{
NV_STATUS status;
uvm_gpu_semaphore_pool_page_t *pool_page;
NvU32 *payloads;
size_t i;
uvm_rm_mem_type_t memory_type = (pool->aperture == UVM_APERTURE_SYS) ? UVM_RM_MEM_TYPE_SYS : UVM_RM_MEM_TYPE_GPU;
uvm_assert_mutex_locked(&pool->mutex);
@@ -188,24 +228,9 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
pool_page->pool = pool;
// Whenever the Confidential Computing feature is enabled, engines can
// access semaphores only in the CPR of vidmem. Mapping to other GPUs is
// also disabled.
if (gpu_semaphore_pool_is_secure(pool)) {
status = pool_alloc_secure_page(pool, pool_page, memory_type);
if (status != NV_OK)
goto error;
}
else {
status = uvm_rm_mem_alloc_and_map_all(pool->gpu,
memory_type,
UVM_SEMAPHORE_PAGE_SIZE,
0,
&pool_page->memory);
status = pool_page_alloc_buffers(pool_page);
if (status != NV_OK)
goto error;
}
// Verify the GPU can access the semaphore pool.
UVM_ASSERT(gpu_can_access_semaphore_pool(pool->gpu, pool_page->memory));
@@ -217,7 +242,9 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
pool->free_semaphores_count += UVM_SEMAPHORE_COUNT_PER_PAGE;
if (semaphore_uses_canary(pool)) {
payloads = uvm_rm_mem_get_cpu_va(pool_page->memory);
size_t i;
NvU32 *payloads = uvm_rm_mem_get_cpu_va(pool_page->memory);
for (i = 0; i < UVM_SEMAPHORE_COUNT_PER_PAGE; i++)
payloads[i] = make_canary(0);
}
@@ -253,7 +280,7 @@ static void pool_free_page(uvm_gpu_semaphore_pool_page_t *page)
pool->free_semaphores_count -= UVM_SEMAPHORE_COUNT_PER_PAGE;
list_del(&page->all_pages_node);
uvm_rm_mem_free(page->memory);
pool_page_free_buffers(page);
uvm_kvfree(page);
}
@@ -273,19 +300,22 @@ NV_STATUS uvm_gpu_semaphore_alloc(uvm_gpu_semaphore_pool_t *pool, uvm_gpu_semaph
goto done;
list_for_each_entry(page, &pool->pages, all_pages_node) {
NvU32 semaphore_index = find_first_bit(page->free_semaphores, UVM_SEMAPHORE_COUNT_PER_PAGE);
const NvU32 semaphore_index = find_first_bit(page->free_semaphores, UVM_SEMAPHORE_COUNT_PER_PAGE);
UVM_ASSERT(semaphore_index <= UVM_SEMAPHORE_COUNT_PER_PAGE);
if (semaphore_index == UVM_SEMAPHORE_COUNT_PER_PAGE)
continue;
if (gpu_semaphore_pool_is_secure(pool)) {
semaphore->conf_computing.index = semaphore_index;
}
else {
semaphore->payload = (NvU32*)((char*)uvm_rm_mem_get_cpu_va(page->memory) +
semaphore_index * UVM_SEMAPHORE_SIZE);
}
semaphore->page = page;
semaphore->index = semaphore_index;
if (gpu_semaphore_pool_is_secure(pool)) {
// Reset the notifier to prevent false attack detection when checking
// for an updated value
*uvm_gpu_semaphore_get_notifier_cpu_va(semaphore) = semaphore->conf_computing.last_observed_notifier;
}
if (semaphore_uses_canary(pool))
UVM_ASSERT(is_canary(uvm_gpu_semaphore_get_payload(semaphore)));
@@ -311,7 +341,6 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
{
uvm_gpu_semaphore_pool_page_t *page;
uvm_gpu_semaphore_pool_t *pool;
NvU32 index;
UVM_ASSERT(semaphore);
@@ -323,7 +352,6 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
return;
pool = page->pool;
index = get_index(semaphore);
// Write a known value lower than the current payload in an attempt to catch
// release-after-free and acquire-after-free.
@@ -333,10 +361,9 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
uvm_mutex_lock(&pool->mutex);
semaphore->page = NULL;
semaphore->payload = NULL;
++pool->free_semaphores_count;
__set_bit(index, page->free_semaphores);
__set_bit(semaphore->index, page->free_semaphores);
uvm_mutex_unlock(&pool->mutex);
}
@@ -449,18 +476,72 @@ NvU64 uvm_gpu_semaphore_get_gpu_proxy_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu
NvU64 uvm_gpu_semaphore_get_gpu_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu_t *gpu, bool is_proxy_va_space)
{
NvU32 index = get_index(semaphore);
NvU64 base_va = uvm_rm_mem_get_gpu_va(semaphore->page->memory, gpu, is_proxy_va_space).address;
return base_va + UVM_SEMAPHORE_SIZE * index;
return base_va + semaphore->index * UVM_SEMAPHORE_SIZE;
}
NvU32 *uvm_gpu_semaphore_get_cpu_va(uvm_gpu_semaphore_t *semaphore)
{
char *base_va;
if (gpu_semaphore_is_secure(semaphore))
return &semaphore->conf_computing.cached_payload;
base_va = uvm_rm_mem_get_cpu_va(semaphore->page->memory);
return (NvU32*)(base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
}
NvU32 *uvm_gpu_semaphore_get_encrypted_payload_cpu_va(uvm_gpu_semaphore_t *semaphore)
{
char *encrypted_base_va = uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.encrypted_payload_memory);
return (NvU32*)(encrypted_base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
}
uvm_gpu_address_t uvm_gpu_semaphore_get_encrypted_payload_gpu_va(uvm_gpu_semaphore_t *semaphore)
{
NvU64 encrypted_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.encrypted_payload_memory,
semaphore->page->pool->gpu);
return uvm_gpu_address_virtual_unprotected(encrypted_base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
}
uvm_gpu_semaphore_notifier_t *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore)
{
uvm_gpu_semaphore_notifier_t *notifier_base_va =
uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.notifier_memory);
return notifier_base_va + semaphore->index;
}
uvm_gpu_address_t uvm_gpu_semaphore_get_notifier_gpu_va(uvm_gpu_semaphore_t *semaphore)
{
NvU64 notifier_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.notifier_memory,
semaphore->page->pool->gpu);
return uvm_gpu_address_virtual_unprotected(notifier_base_va +
semaphore->index * sizeof(uvm_gpu_semaphore_notifier_t));
}
void *uvm_gpu_semaphore_get_auth_tag_cpu_va(uvm_gpu_semaphore_t *semaphore)
{
char *auth_tag_base_va = uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.auth_tag_memory);
return (void*)(auth_tag_base_va + semaphore->index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
}
uvm_gpu_address_t uvm_gpu_semaphore_get_auth_tag_gpu_va(uvm_gpu_semaphore_t *semaphore)
{
NvU64 auth_tag_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.auth_tag_memory,
semaphore->page->pool->gpu);
return uvm_gpu_address_virtual_unprotected(auth_tag_base_va + semaphore->index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
}
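// The getters above all use the same indexing scheme: semaphore i owns slot i
// of each per-page buffer, i.e. offset i * UVM_SEMAPHORE_SIZE in the plain and
// encrypted payload buffers, i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE in the auth
// tag buffer, and i * sizeof(uvm_gpu_semaphore_notifier_t) in the notifier
// buffer.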
NvU32 uvm_gpu_semaphore_get_payload(uvm_gpu_semaphore_t *semaphore)
{
if (gpu_semaphore_is_secure(semaphore))
return UVM_GPU_READ_ONCE(semaphore->conf_computing.cached_payload);
return UVM_GPU_READ_ONCE(*semaphore->payload);
return UVM_GPU_READ_ONCE(*uvm_gpu_semaphore_get_cpu_va(semaphore));
}
void uvm_gpu_semaphore_set_payload(uvm_gpu_semaphore_t *semaphore, NvU32 payload)
@@ -477,10 +558,7 @@ void uvm_gpu_semaphore_set_payload(uvm_gpu_semaphore_t *semaphore, NvU32 payload
// the GPU correctly even on non-SMP).
mb();
if (gpu_semaphore_is_secure(semaphore))
UVM_GPU_WRITE_ONCE(semaphore->conf_computing.cached_payload, payload);
else
UVM_GPU_WRITE_ONCE(*semaphore->payload, payload);
UVM_GPU_WRITE_ONCE(*uvm_gpu_semaphore_get_cpu_va(semaphore), payload);
}
// This function is intended to catch channels which have been left dangling in
@@ -546,22 +624,11 @@ void uvm_gpu_tracking_semaphore_free(uvm_gpu_tracking_semaphore_t *tracking_sem)
uvm_gpu_semaphore_free(&tracking_sem->semaphore);
}
static bool should_skip_secure_semaphore_update(NvU32 last_observed_notifier, NvU32 gpu_notifier)
static void gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, uvm_gpu_semaphore_t *semaphore)
{
// No new value, or the GPU is currently writing the new encrypted material,
// in which case even an apparently unchanged value could be corrupted data.
return (last_observed_notifier == gpu_notifier) || (gpu_notifier % 2);
}
static void uvm_gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, uvm_gpu_semaphore_t *semaphore)
{
UvmCslIv local_iv;
NvU32 local_payload;
NvU32 new_sem_value;
NvU32 gpu_notifier;
NvU32 last_observed_notifier;
NvU32 new_gpu_notifier = 0;
NvU32 iv_index = 0;
uvm_gpu_semaphore_notifier_t gpu_notifier;
uvm_gpu_semaphore_notifier_t new_gpu_notifier = 0;
// A channel can have multiple entries pending and the tracking semaphore
// update of each entry can race with this function. Since the semaphore
@@ -570,64 +637,72 @@ static void uvm_gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, u
unsigned tries_left = channel->num_gpfifo_entries;
NV_STATUS status = NV_OK;
NvU8 local_auth_tag[UVM_CONF_COMPUTING_AUTH_TAG_SIZE];
UvmCslIv *ivs_cpu_addr = semaphore->conf_computing.ivs;
void *auth_tag_cpu_addr = uvm_rm_mem_get_cpu_va(semaphore->conf_computing.auth_tag);
NvU32 *gpu_notifier_cpu_addr = (NvU32 *)uvm_rm_mem_get_cpu_va(semaphore->conf_computing.notifier);
NvU32 *payload_cpu_addr = (NvU32 *)uvm_rm_mem_get_cpu_va(semaphore->conf_computing.encrypted_payload);
uvm_gpu_semaphore_notifier_t *semaphore_notifier_cpu_addr = uvm_gpu_semaphore_get_notifier_cpu_va(semaphore);
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(uvm_channel_is_ce(channel));
last_observed_notifier = semaphore->conf_computing.last_observed_notifier;
gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
UVM_ASSERT(last_observed_notifier <= gpu_notifier);
if (should_skip_secure_semaphore_update(last_observed_notifier, gpu_notifier))
return;
do {
gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
gpu_notifier = UVM_READ_ONCE(*semaphore_notifier_cpu_addr);
UVM_ASSERT(gpu_notifier >= semaphore->conf_computing.last_observed_notifier);
// Odd notifier value means there's an update in progress.
if (gpu_notifier % 2)
continue;
// There's no change since last time
if (gpu_notifier == semaphore->conf_computing.last_observed_notifier)
return;
// Make sure no memory accesses happen before we read the notifier
smp_mb__after_atomic();
iv_index = (gpu_notifier / 2) % channel->num_gpfifo_entries;
memcpy(local_auth_tag, auth_tag_cpu_addr, sizeof(local_auth_tag));
local_payload = UVM_READ_ONCE(*payload_cpu_addr);
memcpy(&local_iv, &ivs_cpu_addr[iv_index], sizeof(local_iv));
memcpy(local_auth_tag, uvm_gpu_semaphore_get_auth_tag_cpu_va(semaphore), sizeof(local_auth_tag));
local_payload = UVM_READ_ONCE(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(semaphore));
// Make sure the second read of notifier happens after
// all memory accesses.
smp_mb__before_atomic();
new_gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
new_gpu_notifier = UVM_READ_ONCE(*semaphore_notifier_cpu_addr);
tries_left--;
} while ((tries_left > 0) && ((gpu_notifier != new_gpu_notifier) || (gpu_notifier % 2)));
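// The loop above is a seqlock-style read: the GPU is assumed to set the
// notifier to an odd value before writing a new encrypted payload and auth
// tag, and to the next even value once the write is complete. Observing the
// same even notifier value before and after copying the payload therefore
// indicates a consistent snapshot; an odd or changed value means a GPU update
// raced with the copy and it must be retried. Since each completed update
// advances the notifier by 2, gpu_notifier / 2 counts updates and selects the
// IV slot (modulo the number of GPFIFO entries) of the entry that produced
// the payload.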
if (!tries_left) {
status = NV_ERR_INVALID_STATE;
goto error;
}
else {
NvU32 key_version;
const NvU32 iv_index = (gpu_notifier / 2) % channel->num_gpfifo_entries;
NvU32 new_semaphore_value;
UVM_ASSERT(gpu_notifier == new_gpu_notifier);
UVM_ASSERT(gpu_notifier % 2 == 0);
// CPU decryption is guaranteed to use the same key version as the
// associated GPU encryption, because if there was any key rotation in
// between, then key rotation waited for all channels to complete before
// proceeding. The wait implies that the semaphore value matches the
// last one encrypted on the GPU, so this CPU decryption should happen
// before the key is rotated.
key_version = uvm_channel_pool_key_version(channel->pool);
if (gpu_notifier == new_gpu_notifier) {
status = uvm_conf_computing_cpu_decrypt(channel,
&new_sem_value,
&new_semaphore_value,
&local_payload,
&local_iv,
sizeof(new_sem_value),
&semaphore->conf_computing.ivs[iv_index],
key_version,
sizeof(new_semaphore_value),
&local_auth_tag);
if (status != NV_OK)
goto error;
uvm_gpu_semaphore_set_payload(semaphore, new_sem_value);
uvm_gpu_semaphore_set_payload(semaphore, new_semaphore_value);
UVM_WRITE_ONCE(semaphore->conf_computing.last_observed_notifier, new_gpu_notifier);
}
return;
return;
}
error:
// Decryption failure is a fatal error, as is running out of tries.
@@ -650,11 +725,11 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
else
uvm_assert_spinlock_locked(&tracking_semaphore->s_lock);
if (tracking_semaphore->semaphore.conf_computing.encrypted_payload) {
if (gpu_semaphore_is_secure(&tracking_semaphore->semaphore)) {
// TODO: Bug 4008734: [UVM][HCC] Extend secure tracking semaphore
// mechanism to all semaphore
uvm_channel_t *channel = container_of(tracking_semaphore, uvm_channel_t, tracking_sem);
uvm_gpu_semaphore_encrypted_payload_update(channel, &tracking_semaphore->semaphore);
gpu_semaphore_encrypted_payload_update(channel, &tracking_semaphore->semaphore);
}
new_sem_value = uvm_gpu_semaphore_get_payload(&tracking_semaphore->semaphore);
@@ -690,7 +765,7 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
UVM_ASSERT_MSG_RELEASE(new_value - old_value <= UVM_GPU_SEMAPHORE_MAX_JUMP,
"GPU %s unexpected semaphore (CPU VA 0x%llx) jump from 0x%llx to 0x%llx\n",
uvm_gpu_name(tracking_semaphore->semaphore.page->pool->gpu),
(NvU64)(uintptr_t)tracking_semaphore->semaphore.payload,
(NvU64)(uintptr_t)uvm_gpu_semaphore_get_cpu_va(&tracking_semaphore->semaphore),
old_value, new_value);
// Use an atomic write even though the lock is held so that the value can

View File

@@ -29,6 +29,8 @@
#include "uvm_rm_mem.h"
#include "uvm_linux.h"
typedef NvU32 uvm_gpu_semaphore_notifier_t;
// A GPU semaphore is a memory location accessible by the GPUs and the CPU
// that's used for synchronization among them.
// The GPU has primitives to acquire (wait for) and release (set) 4-byte memory
@@ -45,17 +47,15 @@ struct uvm_gpu_semaphore_struct
// The semaphore pool page the semaphore came from
uvm_gpu_semaphore_pool_page_t *page;
// Pointer to the memory location
NvU32 *payload;
// Index of the semaphore in semaphore page
NvU16 index;
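// State used only when the Confidential Computing feature is enabled. Based
// on its use elsewhere in this change: cached_payload is the CPU-visible
// copy of the last decrypted payload, ivs holds one decryption IV per GPFIFO
// entry, and last_observed_notifier is the most recent even notifier value
// the CPU has consumed in the encrypted payload update path.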
struct {
NvU16 index;
NvU32 cached_payload;
uvm_rm_mem_t *encrypted_payload;
uvm_rm_mem_t *notifier;
uvm_rm_mem_t *auth_tag;
UvmCslIv *ivs;
NvU32 last_pushed_notifier;
NvU32 last_observed_notifier;
NvU32 cached_payload;
uvm_gpu_semaphore_notifier_t last_pushed_notifier;
uvm_gpu_semaphore_notifier_t last_observed_notifier;
} conf_computing;
};
@@ -151,6 +151,17 @@ NvU64 uvm_gpu_semaphore_get_gpu_proxy_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu
NvU64 uvm_gpu_semaphore_get_gpu_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu_t *gpu, bool is_proxy_va_space);
NvU32 *uvm_gpu_semaphore_get_cpu_va(uvm_gpu_semaphore_t *semaphore);
NvU32 *uvm_gpu_semaphore_get_encrypted_payload_cpu_va(uvm_gpu_semaphore_t *semaphore);
uvm_gpu_address_t uvm_gpu_semaphore_get_encrypted_payload_gpu_va(uvm_gpu_semaphore_t *semaphore);
uvm_gpu_semaphore_notifier_t *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore);
uvm_gpu_address_t uvm_gpu_semaphore_get_notifier_gpu_va(uvm_gpu_semaphore_t *semaphore);
void *uvm_gpu_semaphore_get_auth_tag_cpu_va(uvm_gpu_semaphore_t *semaphore);
uvm_gpu_address_t uvm_gpu_semaphore_get_auth_tag_gpu_va(uvm_gpu_semaphore_t *semaphore);
// Read the 32-bit payload of the semaphore
// Notably doesn't provide any memory ordering guarantees and needs to be used with
// care. For an example of what needs to be considered see

View File

@@ -284,8 +284,10 @@ static void hmm_va_block_unregister_gpu(uvm_va_block_t *va_block,
// Reset preferred location and accessed-by of policy nodes if needed.
uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
if (uvm_id_equal(node->policy.preferred_location, gpu->id))
if (uvm_va_policy_preferred_location_equal(&node->policy, gpu->id, NUMA_NO_NODE)) {
node->policy.preferred_location = UVM_ID_INVALID;
node->policy.preferred_nid = NUMA_NO_NODE;
}
uvm_processor_mask_clear(&node->policy.accessed_by, gpu->id);
}

View File

@@ -27,7 +27,7 @@
const char *uvm_lock_order_to_string(uvm_lock_order_t lock_order)
{
BUILD_BUG_ON(UVM_LOCK_ORDER_COUNT != 34);
BUILD_BUG_ON(UVM_LOCK_ORDER_COUNT != 36);
switch (lock_order) {
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_INVALID);
@@ -48,7 +48,9 @@ const char *uvm_lock_order_to_string(uvm_lock_order_t lock_order)
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CHUNK_MAPPING);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PAGE_TREE);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_KEY_ROTATION);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_PUSH);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_KEY_ROTATION_WLC);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_WLC_PUSH);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_SEC2_PUSH);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PUSH);

View File

@@ -322,6 +322,15 @@
// Operations not allowed while holding this lock
// - GPU memory allocation which can evict
//
// - Channel pool key rotation lock
// Order: UVM_LOCK_ORDER_KEY_ROTATION
// Condition: Confidential Computing is enabled
// Mutex per channel pool
//
// The lock ensures mutual exclusion during key rotation affecting all the
// channels in the associated pool. Key rotation in WLC pools is handled
// using a separate lock order, see UVM_LOCK_ORDER_KEY_ROTATION_WLC below.
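//
// As elsewhere in this file, a lower lock order must be taken before a
// higher one, so a path that both rotates keys and pushes work on a CE
// channel is expected to nest the two locks roughly as follows (sketch only;
// the field name is illustrative, not necessarily the driver's):
//
// uvm_mutex_lock(&pool->key_rotation_mutex); // UVM_LOCK_ORDER_KEY_ROTATION
// ... take the CSL push lock (UVM_LOCK_ORDER_CSL_PUSH), push, rotate ...
// uvm_mutex_unlock(&pool->key_rotation_mutex);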
//
// - CE channel CSL channel pool semaphore
// Order: UVM_LOCK_ORDER_CSL_PUSH
// Condition: The Confidential Computing feature is enabled
@@ -338,6 +347,15 @@
// Operations allowed while holding this lock
// - Pushing work to CE channels (except for WLC channels)
//
// - WLC channel pool key rotation lock
// Order: UVM_LOCK_ORDER_KEY_ROTATION_WLC
// Condition: Confidential Computing is enabled
// Mutex of WLC channel pool
//
// The lock has the same purpose as the regular channel pool key rotation
// lock. Using a different lock order for WLC channels allows key rotation
// on those channels during indirect work submission.
//
// - WLC CSL channel pool semaphore
// Order: UVM_LOCK_ORDER_CSL_WLC_PUSH
// Condition: The Confidential Computing feature is enabled
@@ -484,7 +502,9 @@ typedef enum
UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL,
UVM_LOCK_ORDER_CHUNK_MAPPING,
UVM_LOCK_ORDER_PAGE_TREE,
UVM_LOCK_ORDER_KEY_ROTATION,
UVM_LOCK_ORDER_CSL_PUSH,
UVM_LOCK_ORDER_KEY_ROTATION_WLC,
UVM_LOCK_ORDER_CSL_WLC_PUSH,
UVM_LOCK_ORDER_CSL_SEC2_PUSH,
UVM_LOCK_ORDER_PUSH,

View File

@@ -39,6 +39,7 @@
#include "uvm_pte_batch.h"
#include "uvm_tlb_batch.h"
#include "nv_uvm_interface.h"
#include "nv_uvm_types.h"
#include "uvm_pushbuffer.h"
@@ -101,11 +102,11 @@ static NV_STATUS uvm_pte_buffer_init(uvm_va_range_t *va_range,
pte_buffer->va_range = va_range;
pte_buffer->gpu = gpu;
pte_buffer->mapping_info.cachingType = map_rm_params->caching_type;
pte_buffer->mapping_info.mappingType = map_rm_params->mapping_type;
pte_buffer->mapping_info.formatType = map_rm_params->format_type;
pte_buffer->mapping_info.elementBits = map_rm_params->element_bits;
pte_buffer->mapping_info.compressionType = map_rm_params->compression_type;
pte_buffer->mapping_info.cachingType = (UvmRmGpuCachingType) map_rm_params->caching_type;
pte_buffer->mapping_info.mappingType = (UvmRmGpuMappingType) map_rm_params->mapping_type;
pte_buffer->mapping_info.formatType = (UvmRmGpuFormatType) map_rm_params->format_type;
pte_buffer->mapping_info.elementBits = (UvmRmGpuFormatElementBits) map_rm_params->element_bits;
pte_buffer->mapping_info.compressionType = (UvmRmGpuCompressionType) map_rm_params->compression_type;
if (va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL)
pte_buffer->mapping_info.mappingPageSize = page_size;

View File

@@ -589,7 +589,7 @@ static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
skipped_migrate = true;
}
else if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, dest_id) &&
!uvm_id_equal(dest_id, policy->preferred_location)) {
!uvm_va_policy_preferred_location_equal(policy, dest_id, NUMA_NO_NODE)) {
// Don't migrate to a non-faultable GPU that is in UVM-Lite mode,
// unless it's the preferred location
status = NV_ERR_INVALID_DEVICE;

View File

@@ -126,7 +126,7 @@ NV_STATUS uvm_pmm_sysmem_mappings_add_gpu_mapping(uvm_pmm_sysmem_mappings_t *sys
NvU64 remove_key;
for (remove_key = base_key; remove_key < key; ++remove_key)
(void *)radix_tree_delete(&sysmem_mappings->reverse_map_tree, remove_key);
(void)radix_tree_delete(&sysmem_mappings->reverse_map_tree, remove_key);
kmem_cache_free(g_reverse_page_map_cache, new_reverse_map);
status = errno_to_nv_status(ret);

View File

@@ -671,6 +671,9 @@ static NV_STATUS va_block_set_read_duplication_locked(uvm_va_block_t *va_block,
uvm_assert_mutex_locked(&va_block->lock);
// Force CPU page residency to be on the preferred NUMA node.
va_block_context->make_resident.dest_nid = uvm_va_range_get_policy(va_block->va_range)->preferred_nid;
for_each_id_in_mask(src_id, &va_block->resident) {
NV_STATUS status;
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id, NUMA_NO_NODE);

View File

@@ -100,16 +100,8 @@ void uvm_parent_gpus_from_processor_mask(uvm_parent_processor_mask_t *parent_mas
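// Compare two NUMA node ids for equality; NUMA_NO_NODE only compares equal
// to NUMA_NO_NODE.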
bool uvm_numa_id_eq(int nid0, int nid1)
{
UVM_ASSERT(nid0 == -1 || nid0 < MAX_NUMNODES);
UVM_ASSERT(nid1 == -1 || nid1 < MAX_NUMNODES);
if ((nid0 == NUMA_NO_NODE || nid1 == NUMA_NO_NODE) && nodes_weight(node_possible_map) == 1) {
if (nid0 == NUMA_NO_NODE)
nid0 = first_node(node_possible_map);
if (nid1 == NUMA_NO_NODE)
nid1 = first_node(node_possible_map);
}
UVM_ASSERT(nid0 >= NUMA_NO_NODE && nid0 < MAX_NUMNODES);
UVM_ASSERT(nid1 >= NUMA_NO_NODE && nid1 < MAX_NUMNODES);
return nid0 == nid1;
}

View File

@@ -65,9 +65,12 @@ typedef enum
} uvm_push_flag_t;
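// One crypto bundle is expected to be recorded per push at submission time,
// so that the CPU can later decrypt that push's contents with the matching
// IV, key version and size (see decrypt_push()).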
struct uvm_push_crypto_bundle_struct {
// Initialization vector used to decrypt the push
// Initialization vector used to decrypt the push on the CPU
UvmCslIv iv;
// Key version used to decrypt the push on the CPU
NvU32 key_version;
// Size of the pushbuffer that is encrypted/decrypted
NvU32 push_size;
};

View File

@@ -451,7 +451,6 @@ static uvm_pushbuffer_chunk_t *gpfifo_to_chunk(uvm_pushbuffer_t *pushbuffer, uvm
static void decrypt_push(uvm_channel_t *channel, uvm_gpfifo_entry_t *gpfifo)
{
NV_STATUS status;
NvU32 auth_tag_offset;
void *auth_tag_cpu_va;
void *push_protected_cpu_va;
void *push_unprotected_cpu_va;
@@ -470,16 +469,15 @@ static void decrypt_push(uvm_channel_t *channel, uvm_gpfifo_entry_t *gpfifo)
UVM_ASSERT(!uvm_channel_is_wlc(channel));
UVM_ASSERT(!uvm_channel_is_lcic(channel));
push_protected_cpu_va = (char *)get_base_cpu_va(pushbuffer) + pushbuffer_offset;
push_protected_cpu_va = get_base_cpu_va(pushbuffer) + pushbuffer_offset;
push_unprotected_cpu_va = (char *)uvm_rm_mem_get_cpu_va(pushbuffer->memory_unprotected_sysmem) + pushbuffer_offset;
auth_tag_offset = push_info_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
auth_tag_cpu_va = (char *)uvm_rm_mem_get_cpu_va(channel->conf_computing.push_crypto_bundle_auth_tags) +
auth_tag_offset;
auth_tag_cpu_va = uvm_channel_get_push_crypto_bundle_auth_tags_cpu_va(channel, push_info_index);
status = uvm_conf_computing_cpu_decrypt(channel,
push_protected_cpu_va,
push_unprotected_cpu_va,
&crypto_bundle->iv,
crypto_bundle->key_version,
crypto_bundle->push_size,
auth_tag_cpu_va);
@@ -558,7 +556,7 @@ NvU64 uvm_pushbuffer_get_gpu_va_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_
if (uvm_channel_is_wlc(push->channel) || uvm_channel_is_lcic(push->channel)) {
// We need to use the same static locations for PB as the fixed
// schedule because that's what the channels are initialized to use.
return uvm_rm_mem_get_gpu_uvm_va(push->channel->conf_computing.static_pb_protected_vidmem, gpu);
return uvm_channel_get_static_pb_protected_vidmem_gpu_va(push->channel);
}
else if (uvm_channel_is_sec2(push->channel)) {
// SEC2 PBs are in unprotected sysmem
@@ -575,7 +573,7 @@ void *uvm_pushbuffer_get_unprotected_cpu_va_for_push(uvm_pushbuffer_t *pushbuffe
if (uvm_channel_is_wlc(push->channel)) {
// Reuse existing WLC static pb for initialization
UVM_ASSERT(!uvm_channel_manager_is_wlc_ready(push->channel->pool->manager));
return push->channel->conf_computing.static_pb_unprotected_sysmem_cpu;
return uvm_channel_get_static_pb_unprotected_sysmem_cpu(push->channel);
}
pushbuffer_base = uvm_rm_mem_get_cpu_va(pushbuffer->memory_unprotected_sysmem);
@@ -590,8 +588,8 @@ NvU64 uvm_pushbuffer_get_unprotected_gpu_va_for_push(uvm_pushbuffer_t *pushbuffe
if (uvm_channel_is_wlc(push->channel)) {
// Reuse existing WLC static pb for initialization
UVM_ASSERT(!uvm_channel_manager_is_wlc_ready(push->channel->pool->manager));
return uvm_rm_mem_get_gpu_uvm_va(push->channel->conf_computing.static_pb_unprotected_sysmem,
uvm_push_get_gpu(push));
return uvm_channel_get_static_pb_unprotected_sysmem_gpu_va(push->channel);
}
pushbuffer_base = uvm_rm_mem_get_gpu_uvm_va(pushbuffer->memory_unprotected_sysmem, uvm_push_get_gpu(push));

View File

@@ -322,6 +322,7 @@ static NV_STATUS cpu_decrypt(uvm_channel_t *channel,
uvm_mem_t *dst_mem,
uvm_mem_t *src_mem,
UvmCslIv *decrypt_iv,
NvU32 key_version,
uvm_mem_t *auth_tag_mem,
size_t size,
size_t copy_size)
@@ -338,6 +339,7 @@ static NV_STATUS cpu_decrypt(uvm_channel_t *channel,
dst_plain,
src_cipher,
&decrypt_iv[i],
key_version,
copy_size,
auth_tag_buffer));
@@ -368,7 +370,7 @@ static void gpu_encrypt(uvm_push_t *push,
uvm_gpu_address_t auth_tag_address = uvm_mem_gpu_address_virtual_kernel(auth_tag_mem, gpu);
for (i = 0; i < num_iterations; i++) {
uvm_conf_computing_log_gpu_encryption(push->channel, decrypt_iv);
uvm_conf_computing_log_gpu_encryption(push->channel, copy_size, decrypt_iv);
if (i > 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
@@ -427,6 +429,7 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz
size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
uvm_push_t push;
UvmCslIv *decrypt_iv;
NvU32 key_version;
decrypt_iv = uvm_kvmalloc_zero((size / copy_size) * sizeof(UvmCslIv));
if (!decrypt_iv)
@@ -456,6 +459,11 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz
gpu_encrypt(&push, dst_cipher, dst_plain, decrypt_iv, auth_tag_mem, size, copy_size);
// There shouldn't be any key rotation between the end of the push and the
// CPU decryption(s), but forcing the decryption to use the saved key version
// makes the test more robust against future changes.
key_version = uvm_channel_pool_key_version(push.channel->pool);
TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);
TEST_CHECK_GOTO(!mem_match(src_plain, src_cipher), out);
@@ -465,6 +473,7 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz
dst_plain_cpu,
dst_cipher,
decrypt_iv,
key_version,
auth_tag_mem,
size,
copy_size),

View File

@@ -124,24 +124,23 @@ static NV_STATUS uvm_test_verify_bh_affinity(uvm_intr_handler_t *isr, int node)
static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAMS *params, struct file *filp)
{
uvm_gpu_t *gpu;
NV_STATUS status;
uvm_rm_user_object_t user_rm_va_space = {
.rm_control_fd = -1,
.user_client = params->client,
.user_object = params->smc_part_ref
};
NV_STATUS status = NV_OK;
if (!UVM_THREAD_AFFINITY_SUPPORTED())
return NV_ERR_NOT_SUPPORTED;
status = uvm_gpu_retain_by_uuid(&params->gpu_uuid, &user_rm_va_space, &gpu);
if (status != NV_OK)
return status;
uvm_mutex_lock(&g_uvm_global.global_lock);
gpu = uvm_gpu_get_by_uuid(&params->gpu_uuid);
if (!gpu) {
status = NV_ERR_INVALID_DEVICE;
goto unlock;
}
// If the GPU is not attached to a NUMA node, there is nothing to do.
if (gpu->parent->closest_cpu_numa_node == NUMA_NO_NODE) {
status = NV_ERR_NOT_SUPPORTED;
goto release;
goto unlock;
}
if (gpu->parent->replayable_faults_supported) {
@@ -150,7 +149,7 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
gpu->parent->closest_cpu_numa_node);
uvm_parent_gpu_replayable_faults_isr_unlock(gpu->parent);
if (status != NV_OK)
goto release;
goto unlock;
if (gpu->parent->non_replayable_faults_supported) {
uvm_parent_gpu_non_replayable_faults_isr_lock(gpu->parent);
@@ -158,7 +157,7 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
gpu->parent->closest_cpu_numa_node);
uvm_parent_gpu_non_replayable_faults_isr_unlock(gpu->parent);
if (status != NV_OK)
goto release;
goto unlock;
}
if (gpu->parent->access_counters_supported) {
@@ -168,8 +167,9 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
uvm_parent_gpu_access_counters_isr_unlock(gpu->parent);
}
}
release:
uvm_gpu_release(gpu);
unlock:
uvm_mutex_unlock(&g_uvm_global.global_lock);
return status;
}

View File

@@ -347,20 +347,30 @@ typedef enum
UVM_TEST_CHANNEL_STRESS_MODE_NOOP_PUSH = 0,
UVM_TEST_CHANNEL_STRESS_MODE_UPDATE_CHANNELS,
UVM_TEST_CHANNEL_STRESS_MODE_STREAM,
UVM_TEST_CHANNEL_STRESS_MODE_KEY_ROTATION,
} UVM_TEST_CHANNEL_STRESS_MODE;
typedef enum
{
UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU,
UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU,
UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE,
} UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION;
#define UVM_TEST_CHANNEL_STRESS UVM_TEST_IOCTL_BASE(15)
typedef struct
{
NvU32 mode; // In
NvU32 mode; // In, one of UVM_TEST_CHANNEL_STRESS_MODE
// Number of iterations:
// mode == NOOP_PUSH: number of noop pushes
// mode == UPDATE_CHANNELS: number of updates
// mode == STREAM: number of iterations per stream
// mode == ROTATION: number of operations
NvU32 iterations;
NvU32 num_streams; // In, used only for mode == UVM_TEST_CHANNEL_STRESS_MODE_STREAM
NvU32 num_streams; // In, used only if mode == STREAM
NvU32 key_rotation_operation; // In, used only if mode == ROTATION
NvU32 seed; // In
NvU32 verbose; // In
NV_STATUS rmStatus; // Out
@@ -1210,8 +1220,6 @@ typedef struct
typedef struct
{
NvProcessorUuid gpu_uuid; // In
NvHandle client; // In
NvHandle smc_part_ref; // In
NV_STATUS rmStatus; // Out
} UVM_TEST_NUMA_CHECK_AFFINITY_PARAMS;

View File

@@ -725,8 +725,9 @@ bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, u
}
// Return the preferred NUMA node ID for the block's policy.
// If the preferred node ID is NUMA_NO_NODE, the current NUMA node ID
// is returned.
// If the preferred node ID is NUMA_NO_NODE, the nearest NUMA node ID
// with memory is returned. In most cases, this should be the current
// NUMA node.
static int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context)
{
if (va_block_context->make_resident.dest_nid != NUMA_NO_NODE)
@@ -2070,6 +2071,7 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
uvm_page_mask_t *allocated_mask;
uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
const uvm_va_policy_t *policy = uvm_va_policy_get_region(block, populate_region);
uvm_page_index_t page_index;
uvm_gpu_id_t id;
int preferred_nid = block_context->make_resident.dest_nid;
@@ -2077,6 +2079,10 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
if (block_test && block_test->cpu_chunk_allocation_target_id != NUMA_NO_NODE)
preferred_nid = block_test->cpu_chunk_allocation_target_id;
// If the VA range has a preferred NUMA node, use it.
if (preferred_nid == NUMA_NO_NODE)
preferred_nid = policy->preferred_nid;
// TODO: Bug 4158598: Using NUMA_NO_NODE for staging allocations is sub-optimal.
if (preferred_nid != NUMA_NO_NODE) {
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, preferred_nid);
@@ -2127,13 +2133,12 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
uvm_chunk_sizes_mask_t allocation_sizes;
if (uvm_page_mask_test(allocated_mask, page_index)) {
if (uvm_page_mask_test(allocated_mask, page_index) ||
uvm_va_block_cpu_is_page_resident_on(block, preferred_nid, page_index)) {
page_index = uvm_va_block_next_unset_page_in_mask(populate_region, allocated_mask, page_index) - 1;
continue;
}
UVM_ASSERT(!uvm_va_block_cpu_is_page_resident_on(block, preferred_nid, page_index));
allocation_sizes = block_calculate_largest_alloc_size(block,
page_index,
allocated_mask,
@@ -3843,6 +3848,7 @@ static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
uvm_gpu_address_t src_address = block_copy_get_address(block, &copy_state->src, page_index, gpu);
NvU32 key_version = uvm_channel_pool_key_version(push->channel->pool);
UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
UVM_ASSERT(UVM_ID_IS_CPU(copy_state->dst.id));
@@ -3860,7 +3866,8 @@ static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
// crypto-operations and it only guarantees PAGE_SIZE contiguity, all
// encryptions and decryptions must happen on a PAGE_SIZE basis.
for_each_va_block_page_in_region(page_index, region) {
uvm_conf_computing_log_gpu_encryption(push->channel, &dma_buffer->decrypt_iv[page_index]);
uvm_conf_computing_log_gpu_encryption(push->channel, PAGE_SIZE, &dma_buffer->decrypt_iv[page_index]);
dma_buffer->key_version[page_index] = key_version;
// All but the first encryption can be pipelined. The first encryption
// uses the caller's pipelining settings.
@@ -3919,7 +3926,8 @@ static NV_STATUS conf_computing_copy_pages_finish(uvm_va_block_t *block,
status = uvm_conf_computing_cpu_decrypt(push->channel,
cpu_page_address,
staging_buffer,
&dma_buffer->decrypt_iv[page_index],
dma_buffer->decrypt_iv + page_index,
dma_buffer->key_version[page_index],
PAGE_SIZE,
auth_tag_buffer);
kunmap(dst_page);
@@ -4037,7 +4045,7 @@ static NV_STATUS block_copy_pages(uvm_va_block_t *va_block,
UVM_ASSERT(dst_chunk);
UVM_ASSERT(uvm_cpu_chunk_get_size(src_chunk) >= uvm_va_block_region_size(region));
UVM_ASSERT(uvm_cpu_chunk_get_size(src_chunk) <= uvm_cpu_chunk_get_size(dst_chunk));
UVM_ASSERT(uvm_va_block_region_size(region) <= uvm_cpu_chunk_get_size(dst_chunk));
// CPU-to-CPU copies using memcpy() don't have any inherent ordering with
// copies using GPU CEs. So, we have to make sure that all previously
@@ -5132,7 +5140,7 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
uvm_page_mask_t *dst_resident_mask;
uvm_page_mask_t *migrated_pages;
uvm_page_mask_t *staged_pages;
uvm_page_mask_t *first_touch_mask;
uvm_page_mask_t *scratch_residency_mask;
// TODO: Bug 3660922: need to implement HMM read duplication support.
UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
@@ -5151,6 +5159,10 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
uvm_assert_mutex_locked(&va_block->lock);
UVM_ASSERT(!uvm_va_block_is_dead(va_block));
scratch_residency_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
if (!scratch_residency_mask)
return NV_ERR_NO_MEMORY;
// For pages that are entering read-duplication we need to unmap remote
// mappings and revoke RW and higher access permissions.
//
@@ -5177,12 +5189,12 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask);
if (status != NV_OK)
return status;
goto out;
}
status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
if (status != NV_OK)
return status;
goto out;
status = block_copy_resident_pages(va_block,
va_block_context,
@@ -5192,22 +5204,17 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
prefetch_page_mask,
UVM_VA_BLOCK_TRANSFER_MODE_COPY);
if (status != NV_OK)
return status;
goto out;
// Pages that weren't resident anywhere else were populated at the
// destination directly. Mark them as resident now, since there were no
// errors from block_copy_resident_pages() above.
// Note that va_block_context->scratch_page_mask is passed to
// block_copy_set_first_touch_residency() which is generally unsafe but in
// this case, block_copy_set_first_touch_residency() copies page_mask
// before scratch_page_mask could be clobbered.
migrated_pages = &va_block_context->make_resident.pages_migrated;
first_touch_mask = &va_block_context->scratch_page_mask;
uvm_page_mask_init_from_region(first_touch_mask, region, page_mask);
uvm_page_mask_andnot(first_touch_mask, first_touch_mask, migrated_pages);
uvm_page_mask_init_from_region(scratch_residency_mask, region, page_mask);
uvm_page_mask_andnot(scratch_residency_mask, scratch_residency_mask, migrated_pages);
if (!uvm_page_mask_empty(first_touch_mask))
block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, first_touch_mask);
if (!uvm_page_mask_empty(scratch_residency_mask))
block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, scratch_residency_mask);
staged_pages = &va_block_context->make_resident.pages_staged;
if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) {
@@ -5219,6 +5226,18 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
if (!uvm_page_mask_empty(migrated_pages)) {
if (UVM_ID_IS_CPU(dest_id)) {
// Check if the CPU is already in the resident set of processors.
// We need to do this since we can't have multiple NUMA nodes with
// resident pages.
// If any of the migrated pages were already resident on the CPU, the
// residency has to be switched to the destination NUMA node.
if (uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) &&
uvm_page_mask_and(scratch_residency_mask,
uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE),
migrated_pages)) {
uvm_va_block_cpu_clear_resident_all_chunks(va_block, va_block_context, scratch_residency_mask);
}
uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, migrated_pages);
}
else {
@@ -5247,7 +5266,9 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
// Check state of all chunks after residency change.
// TODO: Bug 4207783: Check both CPU and GPU chunks.
UVM_ASSERT(block_check_cpu_chunks(va_block));
return NV_OK;
out:
kmem_cache_free(g_uvm_page_mask_cache, scratch_residency_mask);
return status;
}
// Looks up the current CPU mapping state of page from the
@@ -5532,13 +5553,15 @@ static bool block_check_mappings_page(uvm_va_block_t *block,
*block->read_duplicated_pages.bitmap);
// Test read_duplicated_pages mask
UVM_ASSERT_MSG((uvm_processor_mask_get_count(resident_processors) <= 1 &&
!uvm_page_mask_test(&block->read_duplicated_pages, page_index)) ||
(uvm_processor_mask_get_count(resident_processors) > 1 &&
uvm_page_mask_test(&block->read_duplicated_pages, page_index)),
UVM_ASSERT_MSG((!uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
uvm_processor_mask_get_count(resident_processors) <= 1) ||
(uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
uvm_processor_mask_get_count(resident_processors) >= 1),
"Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
*resident_processors->bitmap,
*read_mappings->bitmap, *write_mappings->bitmap, *atomic_mappings->bitmap,
*read_mappings->bitmap,
*write_mappings->bitmap,
*atomic_mappings->bitmap,
*va_space->system_wide_atomics_enabled_processors.bitmap,
*block->read_duplicated_pages.bitmap);
@@ -6022,7 +6045,7 @@ static bool block_has_remote_mapping_gpu(uvm_va_block_t *block,
if (uvm_page_mask_empty(mapped_pages))
return false;
return !uvm_id_equal(uvm_va_range_get_policy(block->va_range)->preferred_location, gpu_id);
return !uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(block->va_range), gpu_id, NUMA_NO_NODE);
}
// Remote pages are pages which are mapped but not resident locally
@@ -8365,6 +8388,7 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
uvm_va_block_context_t *block_context,
uvm_gpu_t *gpu,
uvm_processor_id_t resident_id,
int resident_nid,
uvm_page_mask_t *map_page_mask,
uvm_prot_t new_prot,
uvm_tracker_t *out_tracker)
@@ -8374,7 +8398,7 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
uvm_push_t push;
NV_STATUS status;
uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, NUMA_NO_NODE);
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, resident_nid);
uvm_pte_bits_gpu_t pte_bit;
uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot);
uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
@@ -8383,8 +8407,10 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
UVM_ASSERT(map_page_mask);
UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id));
if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id))
UVM_ASSERT(uvm_id_equal(resident_id, uvm_va_range_get_policy(va_block->va_range)->preferred_location));
if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) {
uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
UVM_ASSERT(uvm_va_policy_preferred_location_equal(policy, resident_id, policy->preferred_nid));
}
UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
map_page_mask,
@@ -8486,18 +8512,27 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
return uvm_tracker_add_push_safe(out_tracker, &push);
}
// allowed_nid_mask is only valid if the CPU is set in allowed_mask.
static void map_get_allowed_destinations(uvm_va_block_t *block,
uvm_va_block_context_t *va_block_context,
const uvm_va_policy_t *policy,
uvm_processor_id_t id,
uvm_processor_mask_t *allowed_mask)
uvm_processor_mask_t *allowed_mask,
nodemask_t *allowed_nid_mask)
{
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
*allowed_nid_mask = node_possible_map;
if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) {
// UVM-Lite can only map resident pages on the preferred location
uvm_processor_mask_zero(allowed_mask);
uvm_processor_mask_set(allowed_mask, policy->preferred_location);
if (UVM_ID_IS_CPU(policy->preferred_location) &&
!uvm_va_policy_preferred_location_equal(policy, UVM_ID_CPU, NUMA_NO_NODE)) {
nodes_clear(*allowed_nid_mask);
node_set(policy->preferred_nid, *allowed_nid_mask);
}
}
else if ((uvm_va_policy_is_read_duplicate(policy, va_space) ||
(uvm_id_equal(policy->preferred_location, id) &&
@@ -8540,6 +8575,7 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask;
NV_STATUS status = NV_OK;
const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region);
nodemask_t *allowed_nid_destinations;
va_block_context->mapping.cause = cause;
@@ -8589,10 +8625,20 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
if (!allowed_destinations)
return NV_ERR_NO_MEMORY;
allowed_nid_destinations = uvm_kvmalloc(sizeof(*allowed_nid_destinations));
if (!allowed_nid_destinations) {
uvm_processor_mask_cache_free(allowed_destinations);
return NV_ERR_NO_MEMORY;
}
// Map per resident location so we can more easily detect physically-
// contiguous mappings.
map_get_allowed_destinations(va_block, va_block_context, policy, id, allowed_destinations);
map_get_allowed_destinations(va_block,
va_block_context,
policy,
id,
allowed_destinations,
allowed_nid_destinations);
for_each_closest_id(resident_id, allowed_destinations, id, va_space) {
if (UVM_ID_IS_CPU(id)) {
status = block_map_cpu_to(va_block,
@@ -8603,11 +8649,30 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
new_prot,
out_tracker);
}
else if (UVM_ID_IS_CPU(resident_id)) {
int nid;
// map_get_allowed_destinations() will set the mask of CPU NUMA
// nodes that should be mapped.
for_each_node_mask(nid, *allowed_nid_destinations) {
status = block_map_gpu_to(va_block,
va_block_context,
gpu,
resident_id,
nid,
running_page_mask,
new_prot,
out_tracker);
if (status != NV_OK)
break;
}
}
else {
status = block_map_gpu_to(va_block,
va_block_context,
gpu,
resident_id,
NUMA_NO_NODE,
running_page_mask,
new_prot,
out_tracker);
@@ -8622,6 +8687,7 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
}
uvm_processor_mask_cache_free(allowed_destinations);
uvm_kvfree(allowed_nid_destinations);
return status;
}
@@ -11175,8 +11241,8 @@ NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
// so uvm_va_block_map will be a no-op.
uvm_processor_mask_and(map_uvm_lite_gpus, map_other_processors, block_get_uvm_lite_gpus(va_block));
if (!uvm_processor_mask_empty(map_uvm_lite_gpus) &&
uvm_id_equal(new_residency, preferred_location)) {
for_each_id_in_mask(map_processor_id, map_uvm_lite_gpus) {
uvm_va_policy_preferred_location_equal(policy, new_residency, va_block_context->make_resident.dest_nid)) {
for_each_id_in_mask (map_processor_id, map_uvm_lite_gpus) {
status = uvm_va_block_map(va_block,
va_block_context,
map_processor_id,
@@ -11637,6 +11703,10 @@ static int block_select_node_residency(uvm_va_block_t *va_block,
// For GPU faults, the bottom half is pinned to CPUs closest to their GPU.
// Therefore, in both cases, we can use numa_mem_id() to get the NUMA node
// ID of the faulting processor.
// Note that numa_mem_id() returns the nearest node with memory. In most
// cases, this will be the current NUMA node. However, in the case that the
// current node does not have any memory, we probably want the nearest node
// with memory, anyway.
int current_nid = numa_mem_id();
bool may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint);
@@ -11660,7 +11730,12 @@ static int block_select_node_residency(uvm_va_block_t *va_block,
// If read duplication is enabled and the page is also resident on the CPU,
// keep its current NUMA node residency.
if (may_read_duplicate && uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
return block_get_page_node_residency(va_block, page_index);
return NUMA_NO_NODE;
// The new_residency processor is the CPU and the preferred location is not
// the CPU. If the page is resident on the CPU, keep its current residency.
if (uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
return NUMA_NO_NODE;
return current_nid;
}
@@ -12564,125 +12639,6 @@ NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
return uvm_hmm_va_block_find_create(va_space, addr, hmm_vma, out_block);
}
// Launch a synchronous, encrypted copy between GPU and CPU.
//
// The copy entails a GPU-side encryption (relying on the Copy Engine), and a
// CPU-side decryption step, such that the destination CPU buffer pointed by
// dst_plain will contain the unencrypted (plain text) contents. The destination
// buffer can be in protected or unprotected sysmem, while the source buffer
// must be in protected vidmem.
//
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
//
// The input tracker, if not NULL, is internally acquired by the push
// responsible for the encrypted copy.
__attribute__ ((format(printf, 6, 7)))
static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
void *dst_plain,
uvm_gpu_address_t src_gpu_address,
size_t size,
uvm_tracker_t *tracker,
const char *format,
...)
{
NV_STATUS status;
UvmCslIv decrypt_iv;
uvm_push_t push;
uvm_conf_computing_dma_buffer_t *dma_buffer;
uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
void *src_cipher, *auth_tag;
va_list args;
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
if (status != NV_OK)
return status;
va_start(args, format);
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args);
va_end(args);
if (status != NV_OK)
goto out;
uvm_conf_computing_log_gpu_encryption(push.channel, &decrypt_iv);
dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
status = uvm_push_end_and_wait(&push);
if (status != NV_OK)
goto out;
src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
status = uvm_conf_computing_cpu_decrypt(push.channel, dst_plain, src_cipher, &decrypt_iv, size, auth_tag);
out:
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
return status;
}
// Launch a synchronous, encrypted copy between CPU and GPU.
//
// The source CPU buffer pointed by src_plain contains the unencrypted (plain
// text) contents; the function internally performs a CPU-side encryption step
// before launching the GPU-side CE decryption. The source buffer can be in
// protected or unprotected sysmem, while the destination buffer must be in
// protected vidmem.
//
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
//
// The input tracker, if not NULL, is internally acquired by the push
// responsible for the encrypted copy.
__attribute__ ((format(printf, 6, 7)))
static NV_STATUS encrypted_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
uvm_gpu_address_t dst_gpu_address,
void *src_plain,
size_t size,
uvm_tracker_t *tracker,
const char *format,
...)
{
NV_STATUS status;
uvm_push_t push;
uvm_conf_computing_dma_buffer_t *dma_buffer;
uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address;
void *dst_cipher, *auth_tag;
va_list args;
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
if (status != NV_OK)
return status;
va_start(args, format);
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args);
va_end(args);
if (status != NV_OK)
goto out;
dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag);
src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
status = uvm_push_end_and_wait(&push);
out:
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
return status;
}
static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block,
uvm_gpu_t *gpu,
uvm_gpu_address_t dst_gpu_address,
@@ -12695,14 +12651,14 @@ static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block,
uvm_gpu_address_t src_gpu_address;
if (g_uvm_global.conf_computing_enabled) {
return encrypted_memcopy_cpu_to_gpu(gpu,
dst_gpu_address,
uvm_mem_get_cpu_addr_kernel(src_mem),
size,
&va_block->tracker,
"Encrypted write to [0x%llx, 0x%llx)",
dst,
dst + size);
return uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
dst_gpu_address,
uvm_mem_get_cpu_addr_kernel(src_mem),
size,
&va_block->tracker,
"Encrypted write to [0x%llx, 0x%llx)",
dst,
dst + size);
}
status = uvm_push_begin_acquire(gpu->channel_manager,
@@ -12799,14 +12755,14 @@ static NV_STATUS va_block_read_gpu_to_cpu(uvm_va_block_t *va_block,
uvm_gpu_address_t dst_gpu_address;
if (g_uvm_global.conf_computing_enabled) {
return encrypted_memcopy_gpu_to_cpu(gpu,
uvm_mem_get_cpu_addr_kernel(dst_mem),
src_gpu_address,
size,
&va_block->tracker,
"Encrypted read from [0x%llx, 0x%llx)",
src,
src + size);
return uvm_conf_computing_util_memcopy_gpu_to_cpu(gpu,
uvm_mem_get_cpu_addr_kernel(dst_mem),
src_gpu_address,
size,
&va_block->tracker,
"Encrypted read from [0x%llx, 0x%llx)",
src,
src + size);
}
status = uvm_push_begin_acquire(gpu->channel_manager,

View File

@@ -105,6 +105,12 @@ bool uvm_va_policy_preferred_location_equal(const uvm_va_policy_t *policy, uvm_p
{
bool equal = uvm_id_equal(policy->preferred_location, proc);
if (!UVM_ID_IS_CPU(policy->preferred_location))
UVM_ASSERT(policy->preferred_nid == NUMA_NO_NODE);
if (!UVM_ID_IS_CPU(proc))
UVM_ASSERT(cpu_numa_id == NUMA_NO_NODE);
if (equal && UVM_ID_IS_CPU(policy->preferred_location))
equal = uvm_numa_id_eq(policy->preferred_nid, cpu_numa_id);
@@ -656,7 +662,7 @@ const uvm_va_policy_t *uvm_va_policy_set_preferred_location(uvm_va_block_t *va_b
// and that the policy is changing.
UVM_ASSERT(node->node.start >= start);
UVM_ASSERT(node->node.end <= end);
UVM_ASSERT(!uvm_id_equal(node->policy.preferred_location, processor_id));
UVM_ASSERT(!uvm_va_policy_preferred_location_equal(&node->policy, processor_id, cpu_node_id));
}
node->policy.preferred_location = processor_id;

View File

@@ -868,9 +868,9 @@ static void uvm_va_range_disable_peer_managed(uvm_va_range_t *va_range, uvm_gpu_
// preferred location. If peer mappings are being disabled to the
// preferred location, then unmap the other GPU.
// Nothing to do otherwise.
if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu0->id))
if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu0->id, NUMA_NO_NODE))
uvm_lite_gpu_to_unmap = gpu1;
else if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu1->id))
else if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu1->id, NUMA_NO_NODE))
uvm_lite_gpu_to_unmap = gpu0;
else
return;
@@ -951,7 +951,7 @@ static void va_range_unregister_gpu_managed(uvm_va_range_t *va_range, uvm_gpu_t
// Reset preferred location and accessed-by of VA ranges if needed
// Note: ignoring the return code of uvm_va_range_set_preferred_location since this
// will only return on error when setting a preferred location, not on a reset
if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu->id))
if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu->id, NUMA_NO_NODE))
(void)uvm_va_range_set_preferred_location(va_range, UVM_ID_INVALID, NUMA_NO_NODE, mm, NULL);
uvm_va_range_unset_accessed_by(va_range, gpu->id, NULL);
@@ -1683,7 +1683,7 @@ void uvm_va_range_unset_accessed_by(uvm_va_range_t *va_range,
// If a UVM-Lite GPU is being removed from the accessed_by mask, it will
// also stop being a UVM-Lite GPU unless it's also the preferred location.
if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, processor_id) &&
!uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, processor_id)) {
!uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), processor_id, NUMA_NO_NODE)) {
range_unmap(va_range, processor_id, out_tracker);
}