Mirror of https://github.com/NVIDIA/open-gpu-kernel-modules.git
535.54.03
@@ -338,11 +338,6 @@ static NV_STATUS test_memcpy_and_memset_inner(uvm_gpu_t *gpu,
        return NV_OK;
    }

    if (!gpu->parent->ce_hal->memcopy_is_valid(&push, dst, src)) {
        TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));
        return NV_OK;
    }

    // The input virtual addresses exist in UVM's internal address space, not
    // the proxy address space
    if (uvm_channel_is_proxy(push.channel)) {
@@ -401,7 +396,7 @@ static NV_STATUS test_memcpy_and_memset_inner(uvm_gpu_t *gpu,
static NV_STATUS test_memcpy_and_memset(uvm_gpu_t *gpu)
{
    NV_STATUS status = NV_OK;
    bool is_proxy_va_space;
    bool is_proxy_va_space = false;
    uvm_gpu_address_t gpu_verif_addr;
    void *cpu_verif_addr;
    uvm_mem_t *verif_mem = NULL;
@@ -437,6 +432,34 @@ static NV_STATUS test_memcpy_and_memset(uvm_gpu_t *gpu)
        }
    }

    // Virtual address (in UVM's internal address space) backed by sysmem
    TEST_NV_CHECK_GOTO(uvm_rm_mem_alloc(gpu, UVM_RM_MEM_TYPE_SYS, size, 0, &sys_rm_mem), done);
    gpu_addresses[0] = uvm_rm_mem_get_gpu_va(sys_rm_mem, gpu, is_proxy_va_space);

    if (uvm_conf_computing_mode_enabled(gpu)) {
        for (i = 0; i < iterations; ++i) {
            for (s = 0; s < ARRAY_SIZE(element_sizes); s++) {
                TEST_NV_CHECK_GOTO(test_memcpy_and_memset_inner(gpu,
                                                                gpu_addresses[0],
                                                                gpu_addresses[0],
                                                                size,
                                                                element_sizes[s],
                                                                gpu_verif_addr,
                                                                cpu_verif_addr,
                                                                i),
                                   done);

            }
        }

        // Because gpu_verif_addr is in sysmem, when the Confidential
        // Computing feature is enabled, only the previous cases are valid.
        // TODO: Bug 3839176: the test partially waived on Confidential
        // Computing because it assumes that GPU can access system memory
        // without using encryption.
        goto done;
    }

    // Using a page size equal to the allocation size ensures that the UVM
    // memories about to be allocated are physically contiguous. And since the
    // size is a valid GPU page size, the memories can be virtually mapped on
@@ -448,37 +471,22 @@ static NV_STATUS test_memcpy_and_memset(uvm_gpu_t *gpu)
    // Physical address in sysmem
    TEST_NV_CHECK_GOTO(uvm_mem_alloc(&mem_params, &sys_uvm_mem), done);
    TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_phys(sys_uvm_mem, gpu), done);
    gpu_addresses[0] = uvm_mem_gpu_address_physical(sys_uvm_mem, gpu, 0, size);
    gpu_addresses[1] = uvm_mem_gpu_address_physical(sys_uvm_mem, gpu, 0, size);

    // Physical address in vidmem
    mem_params.backing_gpu = gpu;
    TEST_NV_CHECK_GOTO(uvm_mem_alloc(&mem_params, &gpu_uvm_mem), done);
    gpu_addresses[1] = uvm_mem_gpu_address_physical(gpu_uvm_mem, gpu, 0, size);
    gpu_addresses[2] = uvm_mem_gpu_address_physical(gpu_uvm_mem, gpu, 0, size);

    // Virtual address (in UVM's internal address space) backed by vidmem
    TEST_NV_CHECK_GOTO(uvm_rm_mem_alloc(gpu, UVM_RM_MEM_TYPE_GPU, size, 0, &gpu_rm_mem), done);
    is_proxy_va_space = false;
    gpu_addresses[2] = uvm_rm_mem_get_gpu_va(gpu_rm_mem, gpu, is_proxy_va_space);
    gpu_addresses[3] = uvm_rm_mem_get_gpu_va(gpu_rm_mem, gpu, is_proxy_va_space);

    // Virtual address (in UVM's internal address space) backed by sysmem
    TEST_NV_CHECK_GOTO(uvm_rm_mem_alloc(gpu, UVM_RM_MEM_TYPE_SYS, size, 0, &sys_rm_mem), done);
    gpu_addresses[3] = uvm_rm_mem_get_gpu_va(sys_rm_mem, gpu, is_proxy_va_space);

    for (i = 0; i < iterations; ++i) {
        for (j = 0; j < ARRAY_SIZE(gpu_addresses); ++j) {
            for (k = 0; k < ARRAY_SIZE(gpu_addresses); ++k) {
                for (s = 0; s < ARRAY_SIZE(element_sizes); s++) {
                    // Because gpu_verif_addr is in sysmem, when the Confidential
                    // Computing feature is enabled, only the following cases are
                    // valid.
                    //
                    // TODO: Bug 3839176: the test partially waived on
                    // Confidential Computing because it assumes that GPU can
                    // access system memory without using encryption.
                    if (uvm_conf_computing_mode_enabled(gpu) &&
                        !(gpu_addresses[k].is_unprotected && gpu_addresses[j].is_unprotected)) {
                        continue;
                    }
                    TEST_NV_CHECK_GOTO(test_memcpy_and_memset_inner(gpu,
                                                                    gpu_addresses[k],
                                                                    gpu_addresses[j],

@@ -750,9 +750,9 @@ static void internal_channel_submit_work_wlc(uvm_push_t *push)
                          wlc_channel->channel_info.workSubmissionToken);
}

static NV_STATUS internal_channel_submit_work_indirect(uvm_push_t *push,
                                                       NvU32 old_cpu_put,
                                                       NvU32 new_gpu_put)
static void internal_channel_submit_work_indirect_wlc(uvm_push_t *push,
                                                      NvU32 old_cpu_put,
                                                      NvU32 new_gpu_put)
{
    uvm_pushbuffer_t *pushbuffer = push->channel->pool->manager->pushbuffer;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
@@ -765,10 +765,211 @@ static NV_STATUS internal_channel_submit_work_indirect(uvm_push_t *push,
    NvU64 push_enc_gpu = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push);
    void *push_enc_auth_tag;
    uvm_gpu_address_t push_enc_auth_tag_gpu;
    NvU64 gpfifo_gpu = push->channel->channel_info.gpFifoGpuVa + old_cpu_put * sizeof(gpfifo_entry);
    NvU64 gpfifo_gpu_va = push->channel->channel_info.gpFifoGpuVa + old_cpu_put * sizeof(gpfifo_entry);

    UVM_ASSERT(!uvm_channel_is_sec2(push->channel));
    UVM_ASSERT(uvm_channel_is_wlc(push->launch_channel));

    // WLC submissions are done under channel lock, so there should be no
    // contention to get the right submission order.
    UVM_ASSERT(push->channel->conf_computing.gpu_put == old_cpu_put);

    // This can never stall or return error. WLC launch after WLC channels are
    // initialized uses private static pb space and it neither needs the general
    // PB space, nor does it count towards max concurrent pushes.
    status = uvm_push_begin_on_reserved_channel(push->launch_channel,
                                                &indirect_push,
                                                "Worklaunch to '%s' via '%s'",
                                                push->channel->name,
                                                push->launch_channel->name);
    UVM_ASSERT(status == NV_OK);

    // Move over the pushbuffer data
    // WLC channels use a static preallocated space for launch auth tags
    push_enc_auth_tag = indirect_push.channel->conf_computing.launch_auth_tag_cpu;
    push_enc_auth_tag_gpu = uvm_gpu_address_virtual(indirect_push.channel->conf_computing.launch_auth_tag_gpu_va);

    uvm_conf_computing_cpu_encrypt(indirect_push.channel,
                                   push_enc_cpu,
                                   push->begin,
                                   NULL,
                                   uvm_push_get_size(push),
                                   push_enc_auth_tag);

    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);

    gpu->parent->ce_hal->decrypt(&indirect_push,
                                 uvm_gpu_address_virtual(uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push)),
                                 uvm_gpu_address_virtual(push_enc_gpu),
                                 uvm_push_get_size(push),
                                 push_enc_auth_tag_gpu);

    gpu->parent->host_hal->set_gpfifo_entry(&gpfifo_entry,
                                            uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push),
                                            uvm_push_get_size(push),
                                            UVM_GPFIFO_SYNC_PROCEED);

    gpu->parent->ce_hal->memset_8(&indirect_push,
                                  uvm_gpu_address_virtual(gpfifo_gpu_va),
                                  gpfifo_entry,
                                  sizeof(gpfifo_entry));

    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
    do_semaphore_release(&indirect_push, push->channel->channel_info.gpPutGpuVa, new_gpu_put);

    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
    do_semaphore_release(&indirect_push,
                         push->channel->channel_info.workSubmissionOffsetGpuVa,
                         push->channel->channel_info.workSubmissionToken);

    // Ignore return value of push_wait. It can only fail with channel error
    // which will be detected when waiting for the primary push.
    (void)uvm_push_end_and_wait(&indirect_push);

    push->channel->conf_computing.gpu_put = new_gpu_put;
}

static void update_gpput_via_sec2(uvm_push_t *sec2_push, uvm_channel_t *channel, NvU32 new_gpu_put)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(sec2_push);
    void *gpput_auth_tag_cpu, *gpput_enc_cpu;
    uvm_gpu_address_t gpput_auth_tag_gpu, gpput_enc_gpu;
    NvU32 gpput_scratchpad[UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT/sizeof(new_gpu_put)];

    UVM_ASSERT(uvm_channel_is_sec2(sec2_push->channel));

    gpput_enc_cpu = uvm_push_get_single_inline_buffer(sec2_push,
                                                      UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT,
                                                      UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT,
                                                      &gpput_enc_gpu);
    gpput_auth_tag_cpu = uvm_push_get_single_inline_buffer(sec2_push,
                                                           UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
                                                           UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
                                                           &gpput_auth_tag_gpu);

    // Update GPPUT. The update needs 4B write to specific offset,
    // however we can only do 16B aligned decrypt writes.
    // A poison value is written to all other locations, this is ignored in
    // most locations and overwritten by HW for GPGET location
    memset(gpput_scratchpad, 0, sizeof(gpput_scratchpad));
    UVM_ASSERT(sizeof(*gpput_scratchpad) == sizeof(new_gpu_put));
    gpput_scratchpad[(channel->channel_info.gpPutGpuVa % UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT) /
                     sizeof(*gpput_scratchpad)] = new_gpu_put;

    // Set value of GPGET to be the same as GPPUT. It will be overwritten by
    // HW next time GET value changes. UVM never reads GPGET.
    // However, RM does read it when freeing a channel. When this function
    // is called from 'channel_manager_stop_wlc' we set the value of GPGET
    // to the same value as GPPUT. Mismatch between these two values makes
    // RM wait for any "pending" tasks, leading to significant delays in the
    // channel teardown sequence.
    UVM_ASSERT(channel->channel_info.gpPutGpuVa / UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT ==
               channel->channel_info.gpGetGpuVa / UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT);
    gpput_scratchpad[(channel->channel_info.gpGetGpuVa % UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT) /
                     sizeof(*gpput_scratchpad)] = new_gpu_put;

    uvm_conf_computing_cpu_encrypt(sec2_push->channel,
                                   gpput_enc_cpu,
                                   gpput_scratchpad,
                                   NULL,
                                   sizeof(gpput_scratchpad),
                                   gpput_auth_tag_cpu);
    gpu->parent->sec2_hal->decrypt(sec2_push,
                                   UVM_ALIGN_DOWN(channel->channel_info.gpPutGpuVa,
                                                  UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT),
                                   gpput_enc_gpu.address,
                                   sizeof(gpput_scratchpad),
                                   gpput_auth_tag_gpu.address);
}
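
A worked illustration of the scratchpad indexing in update_gpput_via_sec2() above, using hypothetical addresses (and assuming UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT and UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT are both 16 bytes, as the surrounding code implies):

    // Hypothetical layout: gpPutGpuVa = 0x1008 and gpGetGpuVa = 0x1000, i.e.
    // both live in the same 16B segment that starts at 0x1000.
    //   GPPUT slot: (0x1008 % 16) / sizeof(NvU32) = 8 / 4 = 2
    //   GPGET slot: (0x1000 % 16) / sizeof(NvU32) = 0 / 4 = 0
    // gpput_scratchpad then holds { new_gpu_put, 0, new_gpu_put, 0 }, and the
    // SEC2 decrypt overwrites the whole 16B segment at
    // UVM_ALIGN_DOWN(0x1008, 16) = 0x1000. Only the GPPUT and GPGET slots
    // carry meaningful data; the zero filler elsewhere is either ignored or
    // later rewritten by HW, as the comment in the function notes.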

static void set_gpfifo_via_sec2(uvm_push_t *sec2_push, uvm_channel_t *channel, NvU32 put, NvU64 value)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(sec2_push);
    void *gpfifo_auth_tag_cpu, *gpfifo_enc_cpu;
    uvm_gpu_address_t gpfifo_auth_tag_gpu, gpfifo_enc_gpu;
    NvU64 gpfifo_gpu = channel->channel_info.gpFifoGpuVa + put * sizeof(value);
    NvU64 gpfifo_scratchpad[2];

    UVM_ASSERT(uvm_channel_is_sec2(sec2_push->channel));

    gpfifo_enc_cpu = uvm_push_get_single_inline_buffer(sec2_push,
                                                       sizeof(gpfifo_scratchpad),
                                                       UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT,
                                                       &gpfifo_enc_gpu);
    gpfifo_auth_tag_cpu = uvm_push_get_single_inline_buffer(sec2_push,
                                                            UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
                                                            UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
                                                            &gpfifo_auth_tag_gpu);

    if (IS_ALIGNED(gpfifo_gpu, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT)) {
        gpfifo_scratchpad[0] = value;

        // Set the value of the odd entry to noop.
        // It will be overwritten when the next entry is submitted.
        gpu->parent->host_hal->set_gpfifo_noop(&gpfifo_scratchpad[1]);
    }
    else {
        uvm_gpfifo_entry_t *previous_gpfifo;

        UVM_ASSERT(put > 0);

        previous_gpfifo = &channel->gpfifo_entries[put - 1];

        if (previous_gpfifo->type == UVM_GPFIFO_ENTRY_TYPE_CONTROL) {
            gpfifo_scratchpad[0] = previous_gpfifo->control_value;
        }
        else {
            uvm_pushbuffer_t *pushbuffer = channel->pool->manager->pushbuffer;
            NvU64 prev_pb_va = uvm_pushbuffer_get_gpu_va_base(pushbuffer) + previous_gpfifo->pushbuffer_offset;

            // Reconstruct the previous gpfifo entry. UVM_GPFIFO_SYNC_WAIT is
            // used only in static WLC schedule.
            // Overwriting the previous entry with the same value doesn't hurt,
            // whether the previous entry has been processed or not
            gpu->parent->host_hal->set_gpfifo_entry(&gpfifo_scratchpad[0],
                                                    prev_pb_va,
                                                    previous_gpfifo->pushbuffer_size,
                                                    UVM_GPFIFO_SYNC_PROCEED);
        }

        gpfifo_scratchpad[1] = value;
    }

    uvm_conf_computing_cpu_encrypt(sec2_push->channel,
                                   gpfifo_enc_cpu,
                                   gpfifo_scratchpad,
                                   NULL,
                                   sizeof(gpfifo_scratchpad),
                                   gpfifo_auth_tag_cpu);
    gpu->parent->sec2_hal->decrypt(sec2_push,
                                   UVM_ALIGN_DOWN(gpfifo_gpu, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT),
                                   gpfifo_enc_gpu.address,
                                   sizeof(gpfifo_scratchpad),
                                   gpfifo_auth_tag_gpu.address);
}

static NV_STATUS internal_channel_submit_work_indirect_sec2(uvm_push_t *push,
                                                            NvU32 old_cpu_put,
                                                            NvU32 new_gpu_put)
{
    uvm_pushbuffer_t *pushbuffer = push->channel->pool->manager->pushbuffer;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    uvm_push_t indirect_push;
    NV_STATUS status;
    NvU64 gpfifo_entry;

    void *push_enc_cpu = uvm_pushbuffer_get_unprotected_cpu_va_for_push(pushbuffer, push);
    NvU64 push_enc_gpu = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push);
    void *push_auth_tag_cpu;
    uvm_gpu_address_t push_auth_tag_gpu;
    uvm_spin_loop_t spin;

    UVM_ASSERT(!uvm_channel_is_sec2(push->channel));
    UVM_ASSERT(uvm_channel_is_sec2(push->launch_channel));

    // If the old_cpu_put is not equal to the last gpu put, other pushes are
    // pending that need to be submitted. That push/es' submission will update
@@ -790,60 +991,36 @@ static NV_STATUS internal_channel_submit_work_indirect(uvm_push_t *push,

    // Move over the pushbuffer data
    if (uvm_channel_is_sec2(indirect_push.channel)) {
        push_enc_auth_tag = uvm_push_get_single_inline_buffer(&indirect_push,
                                                              UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
                                                              UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
                                                              &push_enc_auth_tag_gpu);
    }
    else {
        // Auth tags cannot be in protected vidmem.
        // WLC channels use a static preallocated space for launch auth tags
        push_enc_auth_tag = indirect_push.channel->conf_computing.launch_auth_tag_cpu;
        push_enc_auth_tag_gpu = uvm_gpu_address_virtual(indirect_push.channel->conf_computing.launch_auth_tag_gpu_va);
    }
    push_auth_tag_cpu = uvm_push_get_single_inline_buffer(&indirect_push,
                                                          UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
                                                          UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
                                                          &push_auth_tag_gpu);

    uvm_conf_computing_cpu_encrypt(indirect_push.channel,
                                   push_enc_cpu,
                                   push->begin,
                                   NULL,
                                   uvm_push_get_size(push),
                                   push_enc_auth_tag);
                                   push_auth_tag_cpu);

    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);

    if (uvm_channel_is_sec2(indirect_push.channel)) {
        gpu->parent->sec2_hal->decrypt(&indirect_push,
                                       uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push),
                                       push_enc_gpu,
                                       uvm_push_get_size(push),
                                       push_enc_auth_tag_gpu.address);
    }
    else {
        gpu->parent->ce_hal->decrypt(&indirect_push,
                                     uvm_gpu_address_virtual(uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push)),
                                     uvm_gpu_address_virtual(push_enc_gpu),
                                     uvm_push_get_size(push),
                                     push_enc_auth_tag_gpu);
    }
    gpu->parent->sec2_hal->decrypt(&indirect_push,
                                   uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push),
                                   push_enc_gpu,
                                   uvm_push_get_size(push),
                                   push_auth_tag_gpu.address);

    gpu->parent->host_hal->set_gpfifo_entry(&gpfifo_entry,
                                            uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push),
                                            uvm_push_get_size(push),
                                            UVM_GPFIFO_SYNC_PROCEED);

    // TODO: Bug 2649842: RFE - Investigate using 64-bit semaphore
    // SEC2 needs encrypt/decrypt to be 16B aligned; GPFIFO entries are only 8B.
    // Use 2x semaphore release to set the values directly.
    // We could use a single 64 bit release if it were available
    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    do_semaphore_release(&indirect_push, gpfifo_gpu, NvU64_LO32(gpfifo_entry));
    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    do_semaphore_release(&indirect_push, gpfifo_gpu + 4, NvU64_HI32(gpfifo_entry));

    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
    do_semaphore_release(&indirect_push, push->channel->channel_info.gpPutGpuVa, new_gpu_put);
    set_gpfifo_via_sec2(&indirect_push, push->channel, old_cpu_put, gpfifo_entry);
    update_gpput_via_sec2(&indirect_push, push->channel, new_gpu_put);

    // Ring the doorbell
    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
    do_semaphore_release(&indirect_push,
                         push->channel->channel_info.workSubmissionOffsetGpuVa,
@@ -930,11 +1107,7 @@ void uvm_channel_end_push(uvm_push_t *push)
    }
    else if (uvm_conf_computing_mode_enabled(channel_manager->gpu) && !uvm_channel_is_sec2(channel)) {
        if (uvm_channel_manager_is_wlc_ready(channel_manager)) {
            NV_STATUS status = internal_channel_submit_work_indirect(push, cpu_put, new_cpu_put);

            // This codepath should only be used during initialization and thus
            // NEVER return an error.
            UVM_ASSERT(status == NV_OK);
            internal_channel_submit_work_indirect_wlc(push, cpu_put, new_cpu_put);
        }
        else {
            // submitting via SEC2 starts a push, postpone until this push is ended
@@ -963,7 +1136,7 @@ void uvm_channel_end_push(uvm_push_t *push)
    wmb();

    if (needs_sec2_work_submit) {
        NV_STATUS status = internal_channel_submit_work_indirect(push, cpu_put, new_cpu_put);
        NV_STATUS status = internal_channel_submit_work_indirect_sec2(push, cpu_put, new_cpu_put);

        // This codepath should only be used during initialization and thus
        // NEVER return an error.
@@ -1007,7 +1180,6 @@ static NV_STATUS submit_ctrl_gpfifo_indirect(uvm_channel_t *channel,
    uvm_channel_type_t indirect_channel_type = uvm_channel_manager_is_wlc_ready(channel->pool->manager) ?
                                               UVM_CHANNEL_TYPE_WLC :
                                               UVM_CHANNEL_TYPE_SEC2;
    NvU64 gpfifo_gpu = channel->channel_info.gpFifoGpuVa + (old_cpu_put * sizeof(entry->control_value));

    UVM_ASSERT(!uvm_channel_is_sec2(channel));

@@ -1026,17 +1198,26 @@ static NV_STATUS submit_ctrl_gpfifo_indirect(uvm_channel_t *channel,
    if (status != NV_OK)
        return status;

    // TODO: Bug 2649842: RFE - Investigate using 64-bit semaphore
    // SEC2 needs encrypt/decrypt to be 16B aligned; GPFIFO entries are only 8B.
    // Use 2x semaphore release to set the values directly.
    // One 64-bit semaphore release can be used instead once implemented.
    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    do_semaphore_release(&indirect_push, gpfifo_gpu, NvU64_LO32(entry->control_value));
    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    do_semaphore_release(&indirect_push, gpfifo_gpu + 4, NvU64_HI32(entry->control_value));
    if (uvm_channel_is_sec2(indirect_push.channel)) {
        set_gpfifo_via_sec2(&indirect_push, channel, old_cpu_put, entry->control_value);
        update_gpput_via_sec2(&indirect_push, channel, new_gpu_put);
    } else {
        uvm_gpu_t *gpu = uvm_push_get_gpu(&indirect_push);
        NvU64 gpfifo_gpu_va = channel->channel_info.gpFifoGpuVa + (old_cpu_put * sizeof(entry->control_value));

        gpu->parent->ce_hal->memset_8(&indirect_push,
                                      uvm_gpu_address_virtual(gpfifo_gpu_va),
                                      entry->control_value,
                                      sizeof(entry->control_value));

        uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
        do_semaphore_release(&indirect_push, channel->channel_info.gpPutGpuVa, new_gpu_put);
    }

    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
    do_semaphore_release(&indirect_push, channel->channel_info.gpPutGpuVa, new_gpu_put);
    do_semaphore_release(&indirect_push,
                         channel->channel_info.workSubmissionOffsetGpuVa,
                         channel->channel_info.workSubmissionToken);

    status = uvm_push_end_and_wait(&indirect_push);
    if (status != NV_OK)
@@ -1044,9 +1225,6 @@ static NV_STATUS submit_ctrl_gpfifo_indirect(uvm_channel_t *channel,

    channel->conf_computing.gpu_put = new_gpu_put;

    // The above SEC2 work transferred everything
    // Ring the doorbell
    UVM_GPU_WRITE_ONCE(*channel->channel_info.workSubmissionOffset, channel->channel_info.workSubmissionToken);
    return NV_OK;
}

@@ -1445,17 +1623,21 @@ static NV_STATUS alloc_conf_computing_buffers_semaphore(uvm_channel_t *channel)
static NV_STATUS alloc_conf_computing_buffers_wlc(uvm_channel_t *channel)
{
    uvm_gpu_t *gpu = channel->pool->manager->gpu;
    size_t aligned_wlc_push_size = UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT);
    NV_STATUS status = uvm_rm_mem_alloc_and_map_cpu(gpu,
                                                    UVM_RM_MEM_TYPE_SYS,
                                                    UVM_MAX_WLC_PUSH_SIZE + UVM_CONF_COMPUTING_AUTH_TAG_SIZE * 2,
                                                    aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE * 2,
                                                    PAGE_SIZE,
                                                    &channel->conf_computing.static_pb_unprotected_sysmem);
    if (status != NV_OK)
        return status;

    // Both pushes will be targets for SEC2 decrypt operations and have to
    // be aligned for SEC2. The first push location will also be a target
    // for CE decrypt operation and has to be aligned for CE decrypt.
    status = uvm_rm_mem_alloc(gpu,
                              UVM_RM_MEM_TYPE_GPU,
                              UVM_MAX_WLC_PUSH_SIZE * 2,
                              UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT) * 2,
                              UVM_CONF_COMPUTING_BUF_ALIGNMENT,
                              &channel->conf_computing.static_pb_protected_vidmem);
    if (status != NV_OK)
@@ -1464,16 +1646,16 @@ static NV_STATUS alloc_conf_computing_buffers_wlc(uvm_channel_t *channel)
    channel->conf_computing.static_pb_unprotected_sysmem_cpu =
        uvm_rm_mem_get_cpu_va(channel->conf_computing.static_pb_unprotected_sysmem);
    channel->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu =
        (char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu + UVM_MAX_WLC_PUSH_SIZE;
        (char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu + aligned_wlc_push_size;

    // The location below is only used for launch pushes but reuses
    // the same sysmem allocation
    channel->conf_computing.launch_auth_tag_cpu =
        (char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu +
        UVM_MAX_WLC_PUSH_SIZE + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
        aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
    channel->conf_computing.launch_auth_tag_gpu_va =
        uvm_rm_mem_get_gpu_uvm_va(channel->conf_computing.static_pb_unprotected_sysmem, gpu) +
        UVM_MAX_WLC_PUSH_SIZE + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
        aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;

    channel->conf_computing.static_pb_protected_sysmem = uvm_kvmalloc(UVM_MAX_WLC_PUSH_SIZE + UVM_PAGE_SIZE_4K);
    if (!channel->conf_computing.static_pb_protected_sysmem)
@@ -2576,7 +2758,7 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
    // "decrypt_push" represents WLC decrypt push, constructed using fake_push.
    // Copied to wlc_pb_base + UVM_MAX_WLC_PUSH_SIZE, as the second of the two
    // pushes that make the WLC fixed schedule.
    NvU64 decrypt_push_protected_gpu = protected_vidmem + UVM_MAX_WLC_PUSH_SIZE;
    NvU64 decrypt_push_protected_gpu = UVM_ALIGN_UP(protected_vidmem + UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT);
    NvU64 decrypt_push_unprotected_gpu = unprotected_sysmem_gpu + gpfifo_size;
    void *decrypt_push_unprotected_cpu = (char*)gpfifo_unprotected_cpu + gpfifo_size;

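A brief numeric check of the new placement (assuming protected_vidmem itself is at least 16B aligned): with the updated UVM_MAX_WLC_PUSH_SIZE of 364B, UVM_ALIGN_UP(protected_vidmem + 364, 16) puts the decrypt push 368B into the buffer, which lines up with the protected vidmem allocation requested earlier in this change as UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT) * 2 = 736B, i.e. two 368B slots.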
@@ -2587,7 +2769,7 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
    BUILD_BUG_ON(sizeof(*wlc_gpfifo_entries) != sizeof(*wlc->channel_info.gpFifoEntries));

    UVM_ASSERT(uvm_channel_is_wlc(wlc));
    UVM_ASSERT(tag_offset == UVM_MAX_WLC_PUSH_SIZE);
    UVM_ASSERT(tag_offset == UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT));

    // WLC schedule consists of two parts, the number of entries needs to be even.
    // This also guarantees that the size is 16B aligned
@@ -2692,11 +2874,9 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)

    // Prime the WLC by setting "PUT" two steps ahead. Reuse the current
    // cpu_put value that was used during channel initialization.
    // Don't update wlc->cpu_put, it will be used to track
    // submitted pushes as any other channel.
    do_semaphore_release(&sec2_push,
                         wlc->channel_info.gpPutGpuVa,
                         (wlc->cpu_put + 2) % wlc->num_gpfifo_entries);
    // Don't update wlc->cpu_put, it will be used to track submitted pushes
    // as any other channel.
    update_gpput_via_sec2(&sec2_push, wlc, (wlc->cpu_put + 2) % wlc->num_gpfifo_entries);

    status = uvm_push_end_and_wait(&sec2_push);

@@ -3048,9 +3228,7 @@ static void channel_manager_stop_wlc(uvm_channel_manager_t *manager)
        // Every gpfifo entry advances the gpu put of WLC by two so the current
        // value is: (cpu_put * 2) % num_gpfifo_entries and it's ahead of the
        // get pointer by 2.
        do_semaphore_release(&push,
                             channel->channel_info.gpPutGpuVa,
                             (channel->cpu_put * 2 - 2) % channel->num_gpfifo_entries);
        update_gpput_via_sec2(&push, channel, (channel->cpu_put * 2 - 2) % channel->num_gpfifo_entries);
    }

    status = uvm_push_end_and_wait(&push);

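One worked example for the teardown arithmetic above (hypothetical values, not from the source): with num_gpfifo_entries = 8 and cpu_put = 3, the WLC's GPU put currently sits at (3 * 2) % 8 = 6 while GET trails it by 2 at 4; writing (3 * 2 - 2) % 8 = 4 through update_gpput_via_sec2() makes PUT equal to GET, so the static schedule stops being re-executed and, since GPGET is written to the same value, RM sees no "pending" work during channel teardown.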
@@ -378,11 +378,12 @@ void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv)
    NV_STATUS status;

    uvm_mutex_lock(&channel->csl.ctx_lock);
    status = nvUvmInterfaceCslLogDeviceEncryption(&channel->csl.ctx, iv);
    status = nvUvmInterfaceCslIncrementIv(&channel->csl.ctx, UVM_CSL_OPERATION_DECRYPT, 1, iv);
    uvm_mutex_unlock(&channel->csl.ctx_lock);

    // nvUvmInterfaceLogDeviceEncryption fails when a 64-bit encryption counter
    // overflows. This is not supposed to happen on CC.
    // TODO: Bug 4014720: If nvUvmInterfaceCslIncrementIv returns with
    // NV_ERR_INSUFFICIENT_RESOURCES then the IV needs to be rotated via
    // nvUvmInterfaceCslRotateIv.
    UVM_ASSERT(status == NV_OK);
}

@@ -391,11 +392,12 @@ void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *
    NV_STATUS status;

    uvm_mutex_lock(&channel->csl.ctx_lock);
    status = nvUvmInterfaceCslAcquireEncryptionIv(&channel->csl.ctx, iv);
    status = nvUvmInterfaceCslIncrementIv(&channel->csl.ctx, UVM_CSL_OPERATION_ENCRYPT, 1, iv);
    uvm_mutex_unlock(&channel->csl.ctx_lock);

    // nvUvmInterfaceLogDeviceEncryption fails when a 64-bit encryption counter
    // overflows. This is not supposed to happen on CC.
    // TODO: Bug 4014720: If nvUvmInterfaceCslIncrementIv returns with
    // NV_ERR_INSUFFICIENT_RESOURCES then the IV needs to be rotated via
    // nvUvmInterfaceCslRotateIv.
    UVM_ASSERT(status == NV_OK);
}

@@ -439,6 +441,8 @@ NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
                                      (const NvU8 *) src_cipher,
                                      src_iv,
                                      (NvU8 *) dst_plain,
                                      NULL,
                                      0,
                                      (const NvU8 *) auth_tag_buffer);
    uvm_mutex_unlock(&channel->csl.ctx_lock);

@@ -42,9 +42,11 @@
// Use sizeof(UvmCslIv) to refer to the IV size.
#define UVM_CONF_COMPUTING_IV_ALIGNMENT 16

// SEC2 decrypt operation buffers are required to be 16-bytes aligned. CE
// encrypt/decrypt can be unaligned if the buffer lies in a single 32B segment.
// Otherwise, they need to be 32B aligned.
// SEC2 decrypt operation buffers are required to be 16-bytes aligned.
#define UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT 16

// CE encrypt/decrypt can be unaligned if the entire buffer lies in a single
// 32B segment. Otherwise, it needs to be 32B aligned.
#define UVM_CONF_COMPUTING_BUF_ALIGNMENT 32

#define UVM_CONF_COMPUTING_DMA_BUFFER_SIZE UVM_VA_BLOCK_SIZE

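To make the CE alignment rule above concrete, here is a minimal sketch (an illustration only, not a helper that exists in the driver) of the "single 32B segment" condition: a buffer can skip the 32B alignment requirement only if its first and last bytes fall within the same 32-byte segment.

    // Illustration only: true when [addr, addr + size) does not cross a 32B
    // boundary, in which case an unaligned CE encrypt/decrypt is acceptable
    // per the comment above.
    static bool buffer_in_single_32b_segment(NvU64 addr, size_t size)
    {
        return size != 0 &&
               (addr / UVM_CONF_COMPUTING_BUF_ALIGNMENT) ==
               ((addr + size - 1) / UVM_CONF_COMPUTING_BUF_ALIGNMENT);
    }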
@@ -2575,7 +2575,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
            continue;
        }

        if (folio_test_swapcache(page_folio(src_page))) {
        if (PageSwapCache(src_page)) {
            // TODO: Bug 4050579: Remove this when swap cached pages can be
            // migrated.
            if (service_context) {

@@ -166,6 +166,7 @@ void uvm_hal_hopper_sec2_decrypt(uvm_push_t *push, NvU64 dst_va, NvU64 src_va, N
    NvU32 *csl_sign_init = push->next;

    // Check that the provided alignment matches HW
    BUILD_BUG_ON(UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT != (1 << HWSHIFT(CBA2, DECRYPT_COPY_DST_ADDR_LO, DATA)));
    BUILD_BUG_ON(UVM_CONF_COMPUTING_BUF_ALIGNMENT < (1 << HWSHIFT(CBA2, DECRYPT_COPY_DST_ADDR_LO, DATA)));
    BUILD_BUG_ON(UVM_CONF_COMPUTING_BUF_ALIGNMENT % (1 << HWSHIFT(CBA2, DECRYPT_COPY_DST_ADDR_LO, DATA)) != 0);

@@ -161,22 +161,22 @@
// * WFI: 8B
// Total: 64B
//
// Push space needed for secure work launch is 224B. The push is constructed
// Push space needed for secure work launch is 364B. The push is constructed
// in 'internal_channel_submit_work_indirect' and 'uvm_channel_end_push'
// * CE decrypt (of indirect PB): 56B
// * 2*semaphore release (indirect GPFIFO entry): 2*24B
// * memset_8 (indirect GPFIFO entry): 44B
// * semaphore release (indirect GPPUT): 24B
// * semaphore release (indirect doorbell): 24B
// Appendix added in 'uvm_channel_end_push':
// * semaphore release (WLC tracking): 168B
// * semaphore increment (memcopy): 24B
// * semaphore release (payload): 24B
// * notifier memset: 40B
// * payload encryption: 64B
// * notifier memset: 40B
// * semaphore increment (LCIC GPPUT): 24B
// * semaphore release (LCIC doorbell): 24B
// Total: 368B
#define UVM_MAX_WLC_PUSH_SIZE (368)
// Total: 364B
#define UVM_MAX_WLC_PUSH_SIZE (364)
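A rough cross-check of the new 364B figure, assuming the updated itemization consists of the CE decrypt, the memset_8, and the two 24B releases for the launch itself, plus the appendix entries from "semaphore release (payload)" onward: 56 + 44 + 24 + 24 + 24 + 40 + 64 + 40 + 24 + 24 = 364.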

// Push space needed for static LCIC schedule, as initialized in
// 'setup_lcic_schedule':
@@ -184,7 +184,7 @@
// * semaphore increment (WLC GPPUT): 24B
// * semaphore increment (WLC GPPUT): 24B
// * semaphore increment (LCIC tracking): 160B
// * semaphore increment (memcopy): 24B
// * semaphore increment (payload): 24B
// * notifier memcopy: 36B
// * payload encryption: 64B
// * notifier memcopy: 36B

@@ -213,6 +213,7 @@ done:
typedef enum
{
    MEM_ALLOC_TYPE_SYSMEM_DMA,
    MEM_ALLOC_TYPE_SYSMEM_PROTECTED,
    MEM_ALLOC_TYPE_VIDMEM_PROTECTED
} mem_alloc_type_t;

@@ -274,7 +275,11 @@ static NV_STATUS alloc_and_init_mem(uvm_gpu_t *gpu, uvm_mem_t **mem, size_t size
        TEST_NV_CHECK_GOTO(ce_memset_gpu(gpu, *mem, size, 0xdead), err);
    }
    else {
        TEST_NV_CHECK_RET(uvm_mem_alloc_sysmem_dma(size, gpu, NULL, mem));
        if (type == MEM_ALLOC_TYPE_SYSMEM_DMA)
            TEST_NV_CHECK_RET(uvm_mem_alloc_sysmem_dma(size, gpu, NULL, mem));
        else
            TEST_NV_CHECK_RET(uvm_mem_alloc_sysmem(size, NULL, mem));

        TEST_NV_CHECK_GOTO(uvm_mem_map_cpu_kernel(*mem), err);
        TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(*mem, gpu), err);
        write_range_cpu(*mem, size, 0xdeaddead);
@@ -405,48 +410,6 @@ static void gpu_decrypt(uvm_push_t *push,
    }
}

// This test only uses sysmem so that we can use the CPU for encryption and SEC2
// for decryption, i.e., the test doesn't depend on any other GPU engine for
// the encryption operation (refer to test_cpu_to_gpu_roundtrip()). This is not
// how SEC2 is used in the driver. The intended SEC2 usage is to decrypt from
// unprotected sysmem to protected vidmem, which is tested in
// test_cpu_to_gpu_roundtrip().
static NV_STATUS test_cpu_to_gpu_sysmem(uvm_gpu_t *gpu, size_t copy_size, size_t size)
{
    NV_STATUS status = NV_OK;
    uvm_mem_t *src_plain = NULL;
    uvm_mem_t *cipher = NULL;
    uvm_mem_t *dst_plain = NULL;
    uvm_mem_t *auth_tag_mem = NULL;
    size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
    uvm_push_t push;

    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &src_plain, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_plain, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &cipher, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &auth_tag_mem, auth_tag_buffer_size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);

    write_range_cpu(src_plain, size, uvm_get_stale_thread_id());
    write_range_cpu(dst_plain, size, 0xA5A5A5A5);

    TEST_NV_CHECK_GOTO(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_SEC2, &push, "enc(cpu)_dec(gpu)"), out);

    cpu_encrypt(push.channel, cipher, src_plain, auth_tag_mem, size, copy_size);
    gpu_decrypt(&push, dst_plain, cipher, auth_tag_mem, size, copy_size);

    uvm_push_end_and_wait(&push);

    TEST_CHECK_GOTO(mem_match(src_plain, dst_plain), out);

out:
    uvm_mem_free(auth_tag_mem);
    uvm_mem_free(cipher);
    uvm_mem_free(dst_plain);
    uvm_mem_free(src_plain);

    return status;
}

// This test depends on the CE for the encryption, so we assume tests from
// uvm_ce_test.c have successfully passed.
static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, size_t size)
@@ -461,19 +424,16 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz
    size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
    uvm_push_t push;
    UvmCslIv *decrypt_iv;
    uvm_tracker_t tracker;

    decrypt_iv = uvm_kvmalloc_zero((size / copy_size) * sizeof(UvmCslIv));
    if (!decrypt_iv)
        return NV_ERR_NO_MEMORY;

    uvm_tracker_init(&tracker);

    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &src_plain, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &src_plain, size, MEM_ALLOC_TYPE_SYSMEM_PROTECTED), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &src_cipher, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_cipher, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_plain, size, MEM_ALLOC_TYPE_VIDMEM_PROTECTED), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_plain_cpu, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_plain_cpu, size, MEM_ALLOC_TYPE_SYSMEM_PROTECTED), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &auth_tag_mem, auth_tag_buffer_size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);

    write_range_cpu(src_plain, size, uvm_get_stale_thread_id());
@@ -483,14 +443,13 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz
    cpu_encrypt(push.channel, src_cipher, src_plain, auth_tag_mem, size, copy_size);
    gpu_decrypt(&push, dst_plain, src_cipher, auth_tag_mem, size, copy_size);

    uvm_push_end(&push);
    TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), out);

    TEST_NV_CHECK_GOTO(uvm_push_begin_acquire(gpu->channel_manager,
                                              UVM_CHANNEL_TYPE_GPU_TO_CPU,
                                              &tracker,
                                              &push,
                                              "enc(gpu)_dec(cpu)"),
    // Wait for SEC2 before launching the CE part.
    // SEC2 is only allowed to release semaphores in unprotected sysmem,
    // and CE can only acquire semaphores in protected vidmem.
    TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);

    TEST_NV_CHECK_GOTO(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "enc(gpu)_dec(cpu)"),
                       out);

    gpu_encrypt(&push, dst_cipher, dst_plain, decrypt_iv, auth_tag_mem, size, copy_size);
@@ -521,8 +480,6 @@ out:

    uvm_kvfree(decrypt_iv);

    uvm_tracker_deinit(&tracker);

    return status;
}

@@ -545,7 +502,6 @@ static NV_STATUS test_encryption_decryption(uvm_gpu_t *gpu)

    UVM_ASSERT(size % copy_sizes[i] == 0);

    TEST_NV_CHECK_RET(test_cpu_to_gpu_sysmem(gpu, copy_sizes[i], size));
    TEST_NV_CHECK_RET(test_cpu_to_gpu_roundtrip(gpu, copy_sizes[i], size));
}

@@ -69,6 +69,14 @@ static NV_STATUS test_tracker_completion(uvm_va_space_t *va_space)
    gpu = uvm_va_space_find_first_gpu(va_space);
    TEST_CHECK_RET(gpu != NULL);

    // TODO: Bug 4008734: [UVM][HCC] Extend secure tracking semaphore mechanism
    // to all semaphore
    // This test allocates semaphore in vidmem and then releases it from the CPU
    // SEC2 channels cannot target semaphores in vidmem. Moreover, CPU cannot
    // directly release values to vidmem for CE channels.
    if (uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    TEST_NV_CHECK_RET(uvm_gpu_semaphore_alloc(gpu->semaphore_pool, &sema));

    uvm_tracker_init(&tracker);

@@ -7189,6 +7189,7 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
}

static void map_get_allowed_destinations(uvm_va_block_t *block,
                                         uvm_va_block_context_t *va_block_context,
                                         const uvm_va_policy_t *policy,
                                         uvm_processor_id_t id,
                                         uvm_processor_mask_t *allowed_mask)
@@ -7200,7 +7201,10 @@ static void map_get_allowed_destinations(uvm_va_block_t *block,
        uvm_processor_mask_zero(allowed_mask);
        uvm_processor_mask_set(allowed_mask, policy->preferred_location);
    }
    else if ((uvm_va_policy_is_read_duplicate(policy, va_space) || uvm_id_equal(policy->preferred_location, id)) &&
    else if ((uvm_va_policy_is_read_duplicate(policy, va_space) ||
              (uvm_id_equal(policy->preferred_location, id) &&
               !is_uvm_fault_force_sysmem_set() &&
               !uvm_hmm_must_use_sysmem(block, va_block_context))) &&
             uvm_va_space_processor_has_memory(va_space, id)) {
        // When operating under read-duplication we should only map the local
        // processor to cause fault-and-duplicate of remote pages.
@@ -7285,7 +7289,7 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,

    // Map per resident location so we can more easily detect physically-
    // contiguous mappings.
    map_get_allowed_destinations(va_block, va_block_context->policy, id, &allowed_destinations);
    map_get_allowed_destinations(va_block, va_block_context, va_block_context->policy, id, &allowed_destinations);

    for_each_closest_id(resident_id, &allowed_destinations, id, va_space) {
        if (UVM_ID_IS_CPU(id)) {

@@ -418,15 +418,6 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space)
    uvm_global_processor_mask_t retained_gpus;
    LIST_HEAD(deferred_free_list);

    // Normally we'd expect this to happen as part of uvm_mm_release()
    // but if userspace never initialized uvm_mm_fd that won't happen.
    // We don't have to take the va_space_mm spinlock and update state
    // here because we know no other thread can be in or subsequently
    // call uvm_api_mm_initialize successfully because the UVM
    // file-descriptor has been released.
    if (va_space->va_space_mm.state == UVM_VA_SPACE_MM_STATE_UNINITIALIZED)
        uvm_va_space_mm_unregister(va_space);

    // Remove the VA space from the global list before we start tearing things
    // down so other threads can't see the VA space in a partially-valid state.
    uvm_mutex_lock(&g_uvm_global.va_spaces.lock);
@@ -532,7 +523,14 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space)

    uvm_deferred_free_object_list(&deferred_free_list);

    // MM FD teardown should already have destroyed va_space_mm
    // Normally we'd expect this to happen as part of uvm_mm_release()
    // but if userspace never initialized uvm_mm_fd that won't happen.
    // We don't have to take the va_space_mm spinlock and update state
    // here because we know no other thread can be in or subsequently
    // call uvm_api_mm_initialize successfully because the UVM
    // file-descriptor has been released.
    if (va_space->va_space_mm.state == UVM_VA_SPACE_MM_STATE_UNINITIALIZED)
        uvm_va_space_mm_unregister(va_space);
    UVM_ASSERT(!uvm_va_space_mm_alive(&va_space->va_space_mm));

    uvm_mutex_lock(&g_uvm_global.global_lock);