535.54.03

2026-03-13 09:09:53 +00:00 · 2023-06-14 12:37:59 -07:00
parent eb5c7665a1
commit 26458140be
120 changed files with 83370 additions and 81507 deletions
--- a/kernel-open/Kbuild
+++ b/kernel-open/Kbuild
@@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
 EXTRA_CFLAGS += -I$(src)
 EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
 EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
-EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.43.02\"
+EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.54.03\"

 ifneq ($(SYSSRCHOST1X),)
 EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
--- a/kernel-open/common/inc/nv.h
+++ b/kernel-open/common/inc/nv.h
@@ -510,6 +510,12 @@ struct nv_file_private_t
    nv_file_private_t *ctl_nvfp;
    void *ctl_nvfp_priv;
    NvU32 register_or_refcount;
+
+    //
+    // True if a client or an event was ever allocated on this fd.
+    // If false, RMAPI cleanup is skipped.
+    //
+    NvBool bCleanupRmapi;
 };

 // Forward define the gpu ops structures
@@ -959,6 +965,8 @@ NV_STATUS  NV_API_CALL  rm_perform_version_check  (nvidia_stack_t *, void *, NvU

 void       NV_API_CALL  rm_power_source_change_event        (nvidia_stack_t *, NvU32);

+void       NV_API_CALL  rm_request_dnotifier_state          (nvidia_stack_t *, nv_state_t *);
+
 void       NV_API_CALL  rm_disable_gpu_state_persistence    (nvidia_stack_t *sp, nv_state_t *);
 NV_STATUS  NV_API_CALL  rm_p2p_init_mapping       (nvidia_stack_t *, NvU64, NvU64 *, NvU64 *, NvU64 *, NvU64 *, NvU64, NvU64, NvU64, NvU64, void (*)(void *), void *);
 NV_STATUS  NV_API_CALL  rm_p2p_destroy_mapping    (nvidia_stack_t *, NvU64);
--- a/kernel-open/common/inc/nv_uvm_interface.h
+++ b/kernel-open/common/inc/nv_uvm_interface.h
@@ -1455,12 +1455,12 @@ NV_STATUS nvUvmInterfacePagingChannelPushStream(UvmGpuPagingChannelHandle channe
    concurrently with the same UvmCslContext parameter in different threads. The caller must
    guarantee this exclusion.

-    * nvUvmInterfaceCslLogDeviceEncryption
    * nvUvmInterfaceCslRotateIv
    * nvUvmInterfaceCslEncrypt
    * nvUvmInterfaceCslDecrypt
    * nvUvmInterfaceCslSign
    * nvUvmInterfaceCslQueryMessagePool
+    * nvUvmInterfaceCslIncrementIv
 */

 /*******************************************************************************
@@ -1495,62 +1495,17 @@ NV_STATUS nvUvmInterfaceCslInitContext(UvmCslContext *uvmCslContext,
 */
 void nvUvmInterfaceDeinitCslContext(UvmCslContext *uvmCslContext);

-
-/*******************************************************************************
-    nvUvmInterfaceCslLogDeviceEncryption
-
-    Returns an IV that can be later used in the nvUvmInterfaceCslEncrypt
-    method. The IV contains a "freshness bit" which value is set by this method
-    and subsequently dirtied by nvUvmInterfaceCslEncrypt to prevent
-    non-malicious reuse of the IV.
-
-    See "CSL Interface and Locking" for locking requirements.
-    This function does not perform dynamic memory allocation.
-
-    Arguments:
-        uvmCslContext[IN/OUT] - The CSL context.
-        encryptIv[OUT]        - Parameter that is stored before a successful
-                                device encryption. It is used as an input to
-                                nvUvmInterfaceCslEncrypt.
-
-    Error codes:
-      NV_ERR_INSUFFICIENT_RESOURCES - New IV would cause a counter to overflow.
-*/
-NV_STATUS nvUvmInterfaceCslAcquireEncryptionIv(UvmCslContext *uvmCslContext,
-                                               UvmCslIv *encryptIv);
-
-/*******************************************************************************
-    nvUvmInterfaceCslLogDeviceEncryption
-
-    Logs and checks information about device encryption.
-
-    See "CSL Interface and Locking" for locking requirements.
-    This function does not perform dynamic memory allocation.
-
-    Arguments:
-        uvmCslContext[IN/OUT] - The CSL context.
-        decryptIv[OUT]        - Parameter that is stored before a successful
-                                device encryption. It is used as an input to
-                                nvUvmInterfaceCslDecrypt.
-
-    Error codes:
-      NV_ERR_INSUFFICIENT_RESOURCES - The device encryption would cause a counter
-                                      to overflow.
-*/
-NV_STATUS nvUvmInterfaceCslLogDeviceEncryption(UvmCslContext *uvmCslContext,
-                                               UvmCslIv *decryptIv);
-
 /*******************************************************************************
    nvUvmInterfaceCslRotateIv

-    Rotates the IV for a given channel and direction.
+    Rotates the IV for a given channel and operation.

    This function will rotate the IV on both the CPU and the GPU.
    Outstanding messages that have been encrypted by the GPU should first be
-    decrypted before calling this function with direction equal to
-    UVM_CSL_DIR_GPU_TO_CPU. Similiarly, outstanding messages that have been
+    decrypted before calling this function with operation equal to
+    UVM_CSL_OPERATION_DECRYPT. Similarly, outstanding messages that have been
    encrypted by the CPU should first be decrypted before calling this function
-    with direction equal to UVM_CSL_DIR_CPU_TO_GPU. For a given direction
+    with operation equal to UVM_CSL_OPERATION_ENCRYPT. For a given operation
    the channel must be idle before calling this function. This function can be
    called regardless of the value of the IV's message counter.

@@ -1559,17 +1514,17 @@ NV_STATUS nvUvmInterfaceCslLogDeviceEncryption(UvmCslContext *uvmCslContext,

 Arguments:
        uvmCslContext[IN/OUT] - The CSL context.
-        direction[IN]         - Either
-                                - UVM_CSL_DIR_CPU_TO_GPU
-                                - UVM_CSL_DIR_GPU_TO_CPU
+        operation[IN]         - Either
+                                - UVM_CSL_OPERATION_ENCRYPT
+                                - UVM_CSL_OPERATION_DECRYPT

    Error codes:
      NV_ERR_INSUFFICIENT_RESOURCES - The rotate operation would cause a counter
                                      to overflow.
-      NV_ERR_INVALID_ARGUMENT       - Invalid value for direction.
+      NV_ERR_INVALID_ARGUMENT       - Invalid value for operation.
 */
 NV_STATUS nvUvmInterfaceCslRotateIv(UvmCslContext *uvmCslContext,
-                                    UvmCslDirection direction);
+                                    UvmCslOperation operation);

 /*******************************************************************************
    nvUvmInterfaceCslEncrypt
@@ -1580,7 +1535,7 @@ NV_STATUS nvUvmInterfaceCslRotateIv(UvmCslContext *uvmCslContext,
    this function produces undefined behavior. Performance is typically
    maximized when the input and output buffers are 16-byte aligned. This is
    natural alignment for AES block.
-    The encryptIV can be obtained from nvUvmInterfaceCslAcquireEncryptionIv.
+    The encryptIV can be obtained from nvUvmInterfaceCslIncrementIv.
    However, it is optional. If it is NULL, the next IV in line will be used.

    See "CSL Interface and Locking" for locking requirements.
@@ -1623,12 +1578,18 @@ NV_STATUS nvUvmInterfaceCslEncrypt(UvmCslContext *uvmCslContext,

    Arguments:
        uvmCslContext[IN/OUT] - The CSL context.
-        bufferSize[IN]        - Size of the input and output buffers in
-                                units of bytes. Value can range from 1 byte
-                                to (2^32) - 1 bytes.
-        decryptIv[IN]         - Parameter given by nvUvmInterfaceCslLogDeviceEncryption.
+        bufferSize[IN]        - Size of the input and output buffers in units of bytes.
+                                Value can range from 1 byte to (2^32) - 1 bytes.
+        decryptIv[IN]         - IV used to decrypt the ciphertext. Its value can either be given by
+                                nvUvmInterfaceCslIncrementIv, or, if NULL, the CSL context's
+                                internal counter is used.
        inputBuffer[IN]       - Address of ciphertext input buffer.
        outputBuffer[OUT]     - Address of plaintext output buffer.
+        addAuthData[IN]       - Address of the plaintext additional authenticated data used to
+                                calculate the authentication tag. Can be NULL.
+        addAuthDataSize[IN]   - Size of the additional authenticated data in units of bytes.
+                                Value can range from 1 byte to (2^32) - 1 bytes.
+                                This parameter is ignored if addAuthData is NULL.
        authTagBuffer[IN]     - Address of authentication tag buffer.
                                Its size is UVM_CSL_CRYPT_AUTH_TAG_SIZE_BYTES.

@@ -1643,6 +1604,8 @@ NV_STATUS nvUvmInterfaceCslDecrypt(UvmCslContext *uvmCslContext,
                                   NvU8 const *inputBuffer,
                                   UvmCslIv const *decryptIv,
                                   NvU8 *outputBuffer,
+                                   NvU8 const *addAuthData,
+                                   NvU32 addAuthDataSize,
                                   NvU8 const *authTagBuffer);

 /*******************************************************************************
@@ -1673,7 +1636,6 @@ NV_STATUS nvUvmInterfaceCslSign(UvmCslContext *uvmCslContext,
                                NvU8 const *inputBuffer,
                                NvU8 *authTagBuffer);

-
 /*******************************************************************************
    nvUvmInterfaceCslQueryMessagePool

@@ -1684,14 +1646,45 @@ NV_STATUS nvUvmInterfaceCslSign(UvmCslContext *uvmCslContext,

    Arguments:
        uvmCslContext[IN/OUT] - The CSL context.
-        direction[IN]         - Either UVM_CSL_DIR_CPU_TO_GPU or UVM_CSL_DIR_GPU_TO_CPU.
+        operation[IN]         - Either UVM_CSL_OPERATION_ENCRYPT or UVM_CSL_OPERATION_DECRYPT.
        messageNum[OUT]       - Number of messages left before overflow.

    Error codes:
-      NV_ERR_INVALID_ARGUMENT - The value of the direction parameter is illegal.
+      NV_ERR_INVALID_ARGUMENT - The value of the operation parameter is illegal.
 */
 NV_STATUS nvUvmInterfaceCslQueryMessagePool(UvmCslContext *uvmCslContext,
-                                            UvmCslDirection direction,
+                                            UvmCslOperation operation,
                                            NvU64 *messageNum);

+/*******************************************************************************
+    nvUvmInterfaceCslIncrementIv
+
+    Increments the message counter by the specified amount.
+
+    If iv is non-NULL then the incremented value is returned.
+    If operation is UVM_CSL_OPERATION_ENCRYPT then the returned IV's "freshness" bit is set and
+    the IV can be used in nvUvmInterfaceCslEncrypt. If operation is UVM_CSL_OPERATION_DECRYPT
+    then the returned IV can be used in nvUvmInterfaceCslDecrypt.
+
+    See "CSL Interface and Locking" for locking requirements.
+    This function does not perform dynamic memory allocation.
+
+    Arguments:
+        uvmCslContext[IN/OUT] - The CSL context.
+        operation[IN]         - Either
+                                - UVM_CSL_OPERATION_ENCRYPT
+                                - UVM_CSL_OPERATION_DECRYPT
+        increment[IN]         - The amount by which the IV is incremented. Can be 0.
+        iv[OUT]               - If non-NULL, a buffer to store the incremented IV.
+
+    Error codes:
+      NV_ERR_INVALID_ARGUMENT       - The value of the operation parameter is illegal.
+      NV_ERR_INSUFFICIENT_RESOURCES - Incrementing the message counter would result
+                                      in an overflow.
+*/
+NV_STATUS nvUvmInterfaceCslIncrementIv(UvmCslContext *uvmCslContext,
+                                       UvmCslOperation operation,
+                                       NvU64 increment,
+                                       UvmCslIv *iv);
+
 #endif // _NV_UVM_INTERFACE_H_
--- a/kernel-open/common/inc/nv_uvm_types.h
+++ b/kernel-open/common/inc/nv_uvm_types.h
@@ -286,6 +286,7 @@ typedef struct UvmGpuChannelInfo_tag
    // so a channel can be controlled via another channel (SEC2 or WLC/LCIC)
    NvU64             gpFifoGpuVa;
    NvU64             gpPutGpuVa;
+    NvU64             gpGetGpuVa;
    // GPU VA of work submission offset is needed in Confidential Computing
    // so CE channels can ring doorbell of other channels as required for
    // WLC/LCIC work submission
@@ -1060,10 +1061,10 @@ typedef struct UvmCslIv
    NvU8 fresh;
 } UvmCslIv;

-typedef enum UvmCslDirection
+typedef enum UvmCslOperation
 {
-    UVM_CSL_DIR_CPU_TO_GPU,
-    UVM_CSL_DIR_GPU_TO_CPU
-} UvmCslDirection;
+    UVM_CSL_OPERATION_ENCRYPT,
+    UVM_CSL_OPERATION_DECRYPT
+} UvmCslOperation;

 #endif // _NV_UVM_TYPES_H_
--- a/kernel-open/common/inc/rm-gpu-ops.h
+++ b/kernel-open/common/inc/rm-gpu-ops.h
@@ -103,13 +103,12 @@ NV_STATUS  NV_API_CALL rm_gpu_ops_paging_channel_push_stream(nvidia_stack_t *, n

 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_context_init(nvidia_stack_t *, struct ccslContext_t **, nvgpuChannelHandle_t);
 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_context_clear(nvidia_stack_t *, struct ccslContext_t *);
-NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_log_device_encryption(nvidia_stack_t *, struct ccslContext_t *, NvU8 *);
 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_rotate_iv(nvidia_stack_t *, struct ccslContext_t *, NvU8);
-NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_acquire_encryption_iv(nvidia_stack_t *, struct ccslContext_t *, NvU8 *);
 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_encrypt(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 *, NvU8 *);
 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_encrypt_with_iv(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8*, NvU8 *, NvU8 *);
-NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_decrypt(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 const *, NvU8 *, NvU8 const *);
+NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_decrypt(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 const *, NvU8 *, NvU8 const *, NvU32, NvU8 const *);
 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_sign(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 *);
 NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_query_message_pool(nvidia_stack_t *, struct ccslContext_t *, NvU8, NvU64 *);
+NV_STATUS  NV_API_CALL rm_gpu_ops_ccsl_increment_iv(nvidia_stack_t *, struct ccslContext_t *, NvU8, NvU64, NvU8 *);

 #endif
--- a/kernel-open/conftest.sh
+++ b/kernel-open/conftest.sh
@@ -919,6 +919,21 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_VFIO_MIGRATION_OPS_PRESENT" "" "types"
        ;;

+        vfio_precopy_info)
+            #
+            # Determine if vfio_precopy_info struct is present or not
+            #
+            # Added by commit 4db52602a6074 ("vfio: Extend the device migration
+            # protocol with PRE_COPY") in v6.2
+            #
+            CODE="
+            #include <linux/vfio.h>
+            struct vfio_precopy_info precopy_info;
+            "
+
+            compile_check_conftest "$CODE" "NV_VFIO_PRECOPY_INFO_PRESENT" "" "types"
+        ;;
+
        vfio_log_ops)
            #
            # Determine if vfio_log_ops struct is present or not
--- a/kernel-open/nvidia-drm/nvidia-drm-gem.h
+++ b/kernel-open/nvidia-drm/nvidia-drm-gem.h
@@ -179,6 +179,7 @@ static inline int nv_drm_gem_handle_create(struct drm_file *filp,
    return drm_gem_handle_create(filp, &nv_gem->base, handle);
 }

+#if defined(NV_DRM_FENCE_AVAILABLE)
 static inline nv_dma_resv_t *nv_drm_gem_res_obj(struct nv_drm_gem_object *nv_gem)
 {
 #if defined(NV_DRM_GEM_OBJECT_HAS_RESV)
@@ -187,6 +188,7 @@ static inline nv_dma_resv_t *nv_drm_gem_res_obj(struct nv_drm_gem_object *nv_gem
    return nv_gem->base.dma_buf ? nv_gem->base.dma_buf->resv : &nv_gem->resv;
 #endif
 }
+#endif

 void nv_drm_gem_object_init(struct nv_drm_device *nv_dev,
                            struct nv_drm_gem_object *nv_gem,
--- a/kernel-open/nvidia-uvm/uvm_ce_test.c
+++ b/kernel-open/nvidia-uvm/uvm_ce_test.c
@@ -338,11 +338,6 @@ static NV_STATUS test_memcpy_and_memset_inner(uvm_gpu_t *gpu,
        return NV_OK;
    }

-    if (!gpu->parent->ce_hal->memcopy_is_valid(&push, dst, src)) {
-        TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));
-        return NV_OK;
-    }
-
    // The input virtual addresses exist in UVM's internal address space, not
    // the proxy address space
    if (uvm_channel_is_proxy(push.channel)) {
@@ -401,7 +396,7 @@ static NV_STATUS test_memcpy_and_memset_inner(uvm_gpu_t *gpu,
 static NV_STATUS test_memcpy_and_memset(uvm_gpu_t *gpu)
 {
    NV_STATUS status = NV_OK;
-    bool is_proxy_va_space;
+    bool is_proxy_va_space = false;
    uvm_gpu_address_t gpu_verif_addr;
    void *cpu_verif_addr;
    uvm_mem_t *verif_mem = NULL;
@@ -437,6 +432,34 @@ static NV_STATUS test_memcpy_and_memset(uvm_gpu_t *gpu)
        }
    }

+    // Virtual address (in UVM's internal address space) backed by sysmem
+    TEST_NV_CHECK_GOTO(uvm_rm_mem_alloc(gpu, UVM_RM_MEM_TYPE_SYS, size, 0, &sys_rm_mem), done);
+    gpu_addresses[0] = uvm_rm_mem_get_gpu_va(sys_rm_mem, gpu, is_proxy_va_space);
+
+    if (uvm_conf_computing_mode_enabled(gpu)) {
+        for (i = 0; i < iterations; ++i) {
+            for (s = 0; s < ARRAY_SIZE(element_sizes); s++) {
+                TEST_NV_CHECK_GOTO(test_memcpy_and_memset_inner(gpu,
+                                                                gpu_addresses[0],
+                                                                gpu_addresses[0],
+                                                                size,
+                                                                element_sizes[s],
+                                                                gpu_verif_addr,
+                                                                cpu_verif_addr,
+                                                                i),
+                                    done);
+
+            }
+        }
+
+        // Because gpu_verif_addr is in sysmem, when the Confidential
+        // Computing feature is enabled, only the previous cases are valid.
+        // TODO: Bug 3839176: the test is partially waived on Confidential
+        // Computing because it assumes that the GPU can access system memory
+        // without using encryption.
+        goto done;
+    }
+
    // Using a page size equal to the allocation size ensures that the UVM
    // memories about to be allocated are physically contiguous. And since the
    // size is a valid GPU page size, the memories can be virtually mapped on
@@ -448,37 +471,22 @@ static NV_STATUS test_memcpy_and_memset(uvm_gpu_t *gpu)
    // Physical address in sysmem
    TEST_NV_CHECK_GOTO(uvm_mem_alloc(&mem_params, &sys_uvm_mem), done);
    TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_phys(sys_uvm_mem, gpu), done);
-    gpu_addresses[0] = uvm_mem_gpu_address_physical(sys_uvm_mem, gpu, 0, size);
+    gpu_addresses[1] = uvm_mem_gpu_address_physical(sys_uvm_mem, gpu, 0, size);

    // Physical address in vidmem
    mem_params.backing_gpu = gpu;
    TEST_NV_CHECK_GOTO(uvm_mem_alloc(&mem_params, &gpu_uvm_mem), done);
-    gpu_addresses[1] = uvm_mem_gpu_address_physical(gpu_uvm_mem, gpu, 0, size);
+    gpu_addresses[2] = uvm_mem_gpu_address_physical(gpu_uvm_mem, gpu, 0, size);

    // Virtual address (in UVM's internal address space) backed by vidmem
    TEST_NV_CHECK_GOTO(uvm_rm_mem_alloc(gpu, UVM_RM_MEM_TYPE_GPU, size, 0, &gpu_rm_mem), done);
-    is_proxy_va_space = false;
-    gpu_addresses[2] = uvm_rm_mem_get_gpu_va(gpu_rm_mem, gpu, is_proxy_va_space);
+    gpu_addresses[3] = uvm_rm_mem_get_gpu_va(gpu_rm_mem, gpu, is_proxy_va_space);

-    // Virtual address (in UVM's internal address space) backed by sysmem
-    TEST_NV_CHECK_GOTO(uvm_rm_mem_alloc(gpu, UVM_RM_MEM_TYPE_SYS, size, 0, &sys_rm_mem), done);
-    gpu_addresses[3] = uvm_rm_mem_get_gpu_va(sys_rm_mem, gpu, is_proxy_va_space);

    for (i = 0; i < iterations; ++i) {
        for (j = 0; j < ARRAY_SIZE(gpu_addresses); ++j) {
            for (k = 0; k < ARRAY_SIZE(gpu_addresses); ++k) {
                for (s = 0; s < ARRAY_SIZE(element_sizes); s++) {
-                  // Because gpu_verif_addr is in sysmem, when the Confidential
-                  // Computing feature is enabled, only the following cases are
-                  // valid.
-                  //
-                  // TODO: Bug 3839176: the test partially waived on
-                  // Confidential Computing because it assumes that GPU can
-                  // access system memory without using encryption.
-                  if (uvm_conf_computing_mode_enabled(gpu) &&
-                      !(gpu_addresses[k].is_unprotected && gpu_addresses[j].is_unprotected)) {
-                        continue;
-                  }
                    TEST_NV_CHECK_GOTO(test_memcpy_and_memset_inner(gpu,
                                                                    gpu_addresses[k],
                                                                    gpu_addresses[j],
--- a/kernel-open/nvidia-uvm/uvm_channel.c
+++ b/kernel-open/nvidia-uvm/uvm_channel.c
@@ -750,9 +750,9 @@ static void internal_channel_submit_work_wlc(uvm_push_t *push)
                       wlc_channel->channel_info.workSubmissionToken);
 }

-static NV_STATUS internal_channel_submit_work_indirect(uvm_push_t *push,
-                                                       NvU32 old_cpu_put,
-                                                       NvU32 new_gpu_put)
+static void internal_channel_submit_work_indirect_wlc(uvm_push_t *push,
+                                                      NvU32 old_cpu_put,
+                                                      NvU32 new_gpu_put)
 {
    uvm_pushbuffer_t *pushbuffer = push->channel->pool->manager->pushbuffer;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
@@ -765,10 +765,211 @@ static NV_STATUS internal_channel_submit_work_indirect(uvm_push_t *push,
    NvU64 push_enc_gpu = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push);
    void *push_enc_auth_tag;
    uvm_gpu_address_t push_enc_auth_tag_gpu;
-    NvU64 gpfifo_gpu = push->channel->channel_info.gpFifoGpuVa + old_cpu_put * sizeof(gpfifo_entry);
+    NvU64 gpfifo_gpu_va = push->channel->channel_info.gpFifoGpuVa + old_cpu_put * sizeof(gpfifo_entry);
+
+    UVM_ASSERT(!uvm_channel_is_sec2(push->channel));
+    UVM_ASSERT(uvm_channel_is_wlc(push->launch_channel));
+
+    // WLC submissions are done under channel lock, so there should be no
+    // contention to get the right submission order.
+    UVM_ASSERT(push->channel->conf_computing.gpu_put == old_cpu_put);
+
+    // This can never stall or return an error. A WLC launch after WLC channels
+    // are initialized uses private static PB space: it neither needs the
+    // general PB space nor counts towards max concurrent pushes.
+    status = uvm_push_begin_on_reserved_channel(push->launch_channel,
+                                                &indirect_push,
+                                                "Worklaunch to '%s' via '%s'",
+                                                push->channel->name,
+                                                push->launch_channel->name);
+    UVM_ASSERT(status == NV_OK);
+
+
+    // Move over the pushbuffer data
+    // WLC channels use a static preallocated space for launch auth tags
+    push_enc_auth_tag = indirect_push.channel->conf_computing.launch_auth_tag_cpu;
+    push_enc_auth_tag_gpu = uvm_gpu_address_virtual(indirect_push.channel->conf_computing.launch_auth_tag_gpu_va);
+
+    uvm_conf_computing_cpu_encrypt(indirect_push.channel,
+                                   push_enc_cpu,
+                                   push->begin,
+                                   NULL,
+                                   uvm_push_get_size(push),
+                                   push_enc_auth_tag);
+
+    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
+
+    gpu->parent->ce_hal->decrypt(&indirect_push,
+                                 uvm_gpu_address_virtual(uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push)),
+                                 uvm_gpu_address_virtual(push_enc_gpu),
+                                 uvm_push_get_size(push),
+                                 push_enc_auth_tag_gpu);
+
+    gpu->parent->host_hal->set_gpfifo_entry(&gpfifo_entry,
+                                            uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push),
+                                            uvm_push_get_size(push),
+                                            UVM_GPFIFO_SYNC_PROCEED);
+
+    gpu->parent->ce_hal->memset_8(&indirect_push,
+                                  uvm_gpu_address_virtual(gpfifo_gpu_va),
+                                  gpfifo_entry,
+                                  sizeof(gpfifo_entry));
+
+    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
+    do_semaphore_release(&indirect_push, push->channel->channel_info.gpPutGpuVa, new_gpu_put);
+
+    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
+    do_semaphore_release(&indirect_push,
+                         push->channel->channel_info.workSubmissionOffsetGpuVa,
+                         push->channel->channel_info.workSubmissionToken);
+
+    // Ignore return value of push_wait. It can only fail with channel error
+    // which will be detected when waiting for the primary push.
+    (void)uvm_push_end_and_wait(&indirect_push);
+
+    push->channel->conf_computing.gpu_put = new_gpu_put;
+}
+
+static void update_gpput_via_sec2(uvm_push_t *sec2_push, uvm_channel_t *channel, NvU32 new_gpu_put)
+{
+    uvm_gpu_t *gpu = uvm_push_get_gpu(sec2_push);
+    void *gpput_auth_tag_cpu, *gpput_enc_cpu;
+    uvm_gpu_address_t gpput_auth_tag_gpu, gpput_enc_gpu;
+    NvU32 gpput_scratchpad[UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT/sizeof(new_gpu_put)];
+
+    UVM_ASSERT(uvm_channel_is_sec2(sec2_push->channel));
+
+    gpput_enc_cpu = uvm_push_get_single_inline_buffer(sec2_push,
+                                                      UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT,
+                                                      UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT,
+                                                      &gpput_enc_gpu);
+    gpput_auth_tag_cpu = uvm_push_get_single_inline_buffer(sec2_push,
+                                                           UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
+                                                           UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
+                                                           &gpput_auth_tag_gpu);
+
+
+
+    // Update GPPUT. The update needs a 4B write to a specific offset,
+    // but we can only do 16B-aligned decrypt writes.
+    // A poison value is written to all other locations; it is ignored in
+    // most locations and overwritten by HW at the GPGET location.
+    memset(gpput_scratchpad, 0, sizeof(gpput_scratchpad));
+    UVM_ASSERT(sizeof(*gpput_scratchpad) == sizeof(new_gpu_put));
+    gpput_scratchpad[(channel->channel_info.gpPutGpuVa % UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT) /
+                     sizeof(*gpput_scratchpad)] = new_gpu_put;
+
+    // Set value of GPGET to be the same as GPPUT. It will be overwritten by
+    // HW next time GET value changes. UVM never reads GPGET.
+    // However, RM does read it when freeing a channel. When this function
+    // is called from 'channel_manager_stop_wlc' we set the value of GPGET
+    // to the same value as GPPUT. Mismatch between these two values makes
+    // RM wait for any "pending" tasks, leading to significant delays in the
+    // channel teardown sequence.
+    UVM_ASSERT(channel->channel_info.gpPutGpuVa / UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT ==
+               channel->channel_info.gpGetGpuVa / UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT);
+    gpput_scratchpad[(channel->channel_info.gpGetGpuVa % UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT) /
+                     sizeof(*gpput_scratchpad)] = new_gpu_put;
+
+    uvm_conf_computing_cpu_encrypt(sec2_push->channel,
+                                   gpput_enc_cpu,
+                                   gpput_scratchpad,
+                                   NULL,
+                                   sizeof(gpput_scratchpad),
+                                   gpput_auth_tag_cpu);
+    gpu->parent->sec2_hal->decrypt(sec2_push,
+                                   UVM_ALIGN_DOWN(channel->channel_info.gpPutGpuVa,
+                                                  UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT),
+                                   gpput_enc_gpu.address,
+                                   sizeof(gpput_scratchpad),
+                                   gpput_auth_tag_gpu.address);
+}
+
+static void set_gpfifo_via_sec2(uvm_push_t *sec2_push, uvm_channel_t *channel, NvU32 put, NvU64 value)
+{
+    uvm_gpu_t *gpu = uvm_push_get_gpu(sec2_push);
+    void *gpfifo_auth_tag_cpu, *gpfifo_enc_cpu;
+    uvm_gpu_address_t gpfifo_auth_tag_gpu, gpfifo_enc_gpu;
+    NvU64 gpfifo_gpu = channel->channel_info.gpFifoGpuVa + put * sizeof(value);
+    NvU64 gpfifo_scratchpad[2];
+
+    UVM_ASSERT(uvm_channel_is_sec2(sec2_push->channel));
+
+    gpfifo_enc_cpu = uvm_push_get_single_inline_buffer(sec2_push,
+                                                       sizeof(gpfifo_scratchpad),
+                                                       UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT,
+                                                       &gpfifo_enc_gpu);
+    gpfifo_auth_tag_cpu = uvm_push_get_single_inline_buffer(sec2_push,
+                                                            UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
+                                                            UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
+                                                            &gpfifo_auth_tag_gpu);
+
+    if (IS_ALIGNED(gpfifo_gpu, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT)) {
+        gpfifo_scratchpad[0] = value;
+
+        // Set the value of the odd entry to noop.
+        // It will be overwritten when the next entry is submitted.
+        gpu->parent->host_hal->set_gpfifo_noop(&gpfifo_scratchpad[1]);
+    }
+    else {
+        uvm_gpfifo_entry_t *previous_gpfifo;
+
+        UVM_ASSERT(put > 0);
+
+        previous_gpfifo = &channel->gpfifo_entries[put - 1];
+
+        if (previous_gpfifo->type ==  UVM_GPFIFO_ENTRY_TYPE_CONTROL) {
+            gpfifo_scratchpad[0] = previous_gpfifo->control_value;
+        }
+        else {
+            uvm_pushbuffer_t *pushbuffer = channel->pool->manager->pushbuffer;
+            NvU64 prev_pb_va = uvm_pushbuffer_get_gpu_va_base(pushbuffer) + previous_gpfifo->pushbuffer_offset;
+
+            // Reconstruct the previous gpfifo entry. UVM_GPFIFO_SYNC_WAIT is
+            // used only in static WLC schedule.
+            // Overwriting the previous entry with the same value doesn't hurt,
+            // whether the previous entry has been processed or not
+            gpu->parent->host_hal->set_gpfifo_entry(&gpfifo_scratchpad[0],
+                                                    prev_pb_va,
+                                                    previous_gpfifo->pushbuffer_size,
+                                                    UVM_GPFIFO_SYNC_PROCEED);
+        }
+
+        gpfifo_scratchpad[1] = value;
+    }
+
+    uvm_conf_computing_cpu_encrypt(sec2_push->channel,
+                                   gpfifo_enc_cpu,
+                                   gpfifo_scratchpad,
+                                   NULL,
+                                   sizeof(gpfifo_scratchpad),
+                                   gpfifo_auth_tag_cpu);
+    gpu->parent->sec2_hal->decrypt(sec2_push,
+                                   UVM_ALIGN_DOWN(gpfifo_gpu, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT),
+                                   gpfifo_enc_gpu.address,
+                                   sizeof(gpfifo_scratchpad),
+                                   gpfifo_auth_tag_gpu.address);
+}
+
+static NV_STATUS internal_channel_submit_work_indirect_sec2(uvm_push_t *push,
+                                                            NvU32 old_cpu_put,
+                                                            NvU32 new_gpu_put)
+{
+    uvm_pushbuffer_t *pushbuffer = push->channel->pool->manager->pushbuffer;
+    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
+
+    uvm_push_t indirect_push;
+    NV_STATUS status;
+    NvU64 gpfifo_entry;
+
+    void *push_enc_cpu = uvm_pushbuffer_get_unprotected_cpu_va_for_push(pushbuffer, push);
+    NvU64 push_enc_gpu = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push);
+    void *push_auth_tag_cpu;
+    uvm_gpu_address_t push_auth_tag_gpu;
    uvm_spin_loop_t spin;

    UVM_ASSERT(!uvm_channel_is_sec2(push->channel));
+    UVM_ASSERT(uvm_channel_is_sec2(push->launch_channel));

    // If the old_cpu_put is not equal to the last gpu put, other pushes are
    // pending that need to be submitted. That push/es' submission will update
@@ -790,60 +991,36 @@ static NV_STATUS internal_channel_submit_work_indirect(uvm_push_t *push,


    // Move over the pushbuffer data
-    if (uvm_channel_is_sec2(indirect_push.channel)) {
-        push_enc_auth_tag = uvm_push_get_single_inline_buffer(&indirect_push,
-                                                              UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
-                                                              UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
-                                                              &push_enc_auth_tag_gpu);
-    }
-    else {
-        // Auth tags cannot be in protected vidmem.
-        // WLC channels use a static preallocated space for launch auth tags
-        push_enc_auth_tag = indirect_push.channel->conf_computing.launch_auth_tag_cpu;
-        push_enc_auth_tag_gpu = uvm_gpu_address_virtual(indirect_push.channel->conf_computing.launch_auth_tag_gpu_va);
-    }
+    push_auth_tag_cpu = uvm_push_get_single_inline_buffer(&indirect_push,
+                                                          UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
+                                                          UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
+                                                          &push_auth_tag_gpu);

    uvm_conf_computing_cpu_encrypt(indirect_push.channel,
                                   push_enc_cpu,
                                   push->begin,
                                   NULL,
                                   uvm_push_get_size(push),
-                                   push_enc_auth_tag);
+                                   push_auth_tag_cpu);

    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);

-    if (uvm_channel_is_sec2(indirect_push.channel)) {
-        gpu->parent->sec2_hal->decrypt(&indirect_push,
-                                       uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push),
-                                       push_enc_gpu,
-                                       uvm_push_get_size(push),
-                                       push_enc_auth_tag_gpu.address);
-    }
-    else {
-        gpu->parent->ce_hal->decrypt(&indirect_push,
-                                     uvm_gpu_address_virtual(uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push)),
-                                     uvm_gpu_address_virtual(push_enc_gpu),
-                                     uvm_push_get_size(push),
-                                     push_enc_auth_tag_gpu);
-    }
+    gpu->parent->sec2_hal->decrypt(&indirect_push,
+                                   uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push),
+                                   push_enc_gpu,
+                                   uvm_push_get_size(push),
+                                   push_auth_tag_gpu.address);

    gpu->parent->host_hal->set_gpfifo_entry(&gpfifo_entry,
                                            uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push),
                                            uvm_push_get_size(push),
                                            UVM_GPFIFO_SYNC_PROCEED);

-    // TODO: Bug 2649842: RFE - Investigate using 64-bit semaphore
-    // SEC2 needs encrypt decrypt to be 16B aligned GPFIFO entries are only 8B
-    // Use 2x semaphore release to set the values directly.
-    // We could use a single 64 bit release if it were available
-    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
-    do_semaphore_release(&indirect_push, gpfifo_gpu, NvU64_LO32(gpfifo_entry));
-    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
-    do_semaphore_release(&indirect_push, gpfifo_gpu + 4, NvU64_HI32(gpfifo_entry));

-    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
-    do_semaphore_release(&indirect_push, push->channel->channel_info.gpPutGpuVa, new_gpu_put);
+    set_gpfifo_via_sec2(&indirect_push, push->channel, old_cpu_put, gpfifo_entry);
+    update_gpput_via_sec2(&indirect_push, push->channel, new_gpu_put);

+    // Ring the doorbell
    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
    do_semaphore_release(&indirect_push,
                         push->channel->channel_info.workSubmissionOffsetGpuVa,
@@ -930,11 +1107,7 @@ void uvm_channel_end_push(uvm_push_t *push)
    }
    else if (uvm_conf_computing_mode_enabled(channel_manager->gpu) && !uvm_channel_is_sec2(channel)) {
        if (uvm_channel_manager_is_wlc_ready(channel_manager)) {
-            NV_STATUS status = internal_channel_submit_work_indirect(push, cpu_put, new_cpu_put);
-
-            // This codepath should only be used during initialization and thus
-            // NEVER return an error.
-            UVM_ASSERT(status == NV_OK);
+            internal_channel_submit_work_indirect_wlc(push, cpu_put, new_cpu_put);
        }
        else {
            // submitting via SEC2 starts a push, postpone until this push is ended
@@ -963,7 +1136,7 @@ void uvm_channel_end_push(uvm_push_t *push)
    wmb();

    if (needs_sec2_work_submit) {
-        NV_STATUS status = internal_channel_submit_work_indirect(push, cpu_put, new_cpu_put);
+        NV_STATUS status = internal_channel_submit_work_indirect_sec2(push, cpu_put, new_cpu_put);

        // This codepath should only be used during initialization and thus
        // NEVER return an error.
@@ -1007,7 +1180,6 @@ static NV_STATUS submit_ctrl_gpfifo_indirect(uvm_channel_t *channel,
    uvm_channel_type_t indirect_channel_type = uvm_channel_manager_is_wlc_ready(channel->pool->manager) ?
                                               UVM_CHANNEL_TYPE_WLC :
                                               UVM_CHANNEL_TYPE_SEC2;
-    NvU64 gpfifo_gpu = channel->channel_info.gpFifoGpuVa + (old_cpu_put * sizeof(entry->control_value));

    UVM_ASSERT(!uvm_channel_is_sec2(channel));

@@ -1026,17 +1198,26 @@ static NV_STATUS submit_ctrl_gpfifo_indirect(uvm_channel_t *channel,
    if (status != NV_OK)
        return status;

-    // TODO: Bug 2649842: RFE - Investigate using 64-bit semaphore
-    // SEC2 needs encrypt decrypt to be 16B aligned GPFIFO entries are only 8B
-    // Use 2x semaphore release to set the values directly.
-    // One 64bit semahore release can be used instead once implemented.
-    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
-    do_semaphore_release(&indirect_push, gpfifo_gpu, NvU64_LO32(entry->control_value));
-    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
-    do_semaphore_release(&indirect_push, gpfifo_gpu + 4,  NvU64_HI32(entry->control_value));
+    if (uvm_channel_is_sec2(indirect_push.channel)) {
+        set_gpfifo_via_sec2(&indirect_push, channel, old_cpu_put, entry->control_value);
+        update_gpput_via_sec2(&indirect_push, channel, new_gpu_put);
+    } else {
+        uvm_gpu_t *gpu = uvm_push_get_gpu(&indirect_push);
+        NvU64 gpfifo_gpu_va = channel->channel_info.gpFifoGpuVa + (old_cpu_put * sizeof(entry->control_value));
+
+        gpu->parent->ce_hal->memset_8(&indirect_push,
+                                      uvm_gpu_address_virtual(gpfifo_gpu_va),
+                                      entry->control_value,
+                                      sizeof(entry->control_value));
+
+        uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
+        do_semaphore_release(&indirect_push, channel->channel_info.gpPutGpuVa, new_gpu_put);
+    }

    uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
-    do_semaphore_release(&indirect_push, channel->channel_info.gpPutGpuVa, new_gpu_put);
+    do_semaphore_release(&indirect_push,
+                         channel->channel_info.workSubmissionOffsetGpuVa,
+                         channel->channel_info.workSubmissionToken);

    status = uvm_push_end_and_wait(&indirect_push);
    if (status != NV_OK)
@@ -1044,9 +1225,6 @@ static NV_STATUS submit_ctrl_gpfifo_indirect(uvm_channel_t *channel,

    channel->conf_computing.gpu_put = new_gpu_put;

-    // The above SEC2 work transferred everything
-    // Ring the doorbell
-    UVM_GPU_WRITE_ONCE(*channel->channel_info.workSubmissionOffset, channel->channel_info.workSubmissionToken);
    return NV_OK;
 }

@@ -1445,17 +1623,21 @@ static NV_STATUS alloc_conf_computing_buffers_semaphore(uvm_channel_t *channel)
 static NV_STATUS alloc_conf_computing_buffers_wlc(uvm_channel_t *channel)
 {
    uvm_gpu_t *gpu = channel->pool->manager->gpu;
+    size_t aligned_wlc_push_size = UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT);
    NV_STATUS status = uvm_rm_mem_alloc_and_map_cpu(gpu,
                                                    UVM_RM_MEM_TYPE_SYS,
-                                                    UVM_MAX_WLC_PUSH_SIZE + UVM_CONF_COMPUTING_AUTH_TAG_SIZE * 2,
+                                                    aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE * 2,
                                                    PAGE_SIZE,
                                                    &channel->conf_computing.static_pb_unprotected_sysmem);
    if (status != NV_OK)
        return status;

+    // Both pushes will be targets for SEC2 decrypt operations and have to
+    // be aligned for SEC2. The first push location will also be a target
+    // for a CE decrypt operation and has to be aligned for CE decrypt.
    status = uvm_rm_mem_alloc(gpu,
                              UVM_RM_MEM_TYPE_GPU,
-                              UVM_MAX_WLC_PUSH_SIZE * 2,
+                              UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT) * 2,
                              UVM_CONF_COMPUTING_BUF_ALIGNMENT,
                              &channel->conf_computing.static_pb_protected_vidmem);
    if (status != NV_OK)
@@ -1464,16 +1646,16 @@ static NV_STATUS alloc_conf_computing_buffers_wlc(uvm_channel_t *channel)
    channel->conf_computing.static_pb_unprotected_sysmem_cpu =
        uvm_rm_mem_get_cpu_va(channel->conf_computing.static_pb_unprotected_sysmem);
    channel->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu =
-        (char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu + UVM_MAX_WLC_PUSH_SIZE;
+        (char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu + aligned_wlc_push_size;

    // The location below is only used for launch pushes but reuses
    // the same sysmem allocation
    channel->conf_computing.launch_auth_tag_cpu =
        (char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu +
-        UVM_MAX_WLC_PUSH_SIZE + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
+        aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
    channel->conf_computing.launch_auth_tag_gpu_va =
        uvm_rm_mem_get_gpu_uvm_va(channel->conf_computing.static_pb_unprotected_sysmem, gpu) +
-        UVM_MAX_WLC_PUSH_SIZE + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
+        aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;

    channel->conf_computing.static_pb_protected_sysmem = uvm_kvmalloc(UVM_MAX_WLC_PUSH_SIZE + UVM_PAGE_SIZE_4K);
    if (!channel->conf_computing.static_pb_protected_sysmem)
@@ -2576,7 +2758,7 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
    // "decrypt_push" represents WLC decrypt push, constructed using fake_push.
    // Copied to wlc_pb_base + UVM_MAX_WLC_PUSH_SIZE, as the second of the two
    // pushes that make the WLC fixed schedule.
-    NvU64 decrypt_push_protected_gpu = protected_vidmem + UVM_MAX_WLC_PUSH_SIZE;
+    NvU64 decrypt_push_protected_gpu = UVM_ALIGN_UP(protected_vidmem + UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT);
    NvU64 decrypt_push_unprotected_gpu = unprotected_sysmem_gpu + gpfifo_size;
    void *decrypt_push_unprotected_cpu = (char*)gpfifo_unprotected_cpu + gpfifo_size;

@@ -2587,7 +2769,7 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
    BUILD_BUG_ON(sizeof(*wlc_gpfifo_entries) != sizeof(*wlc->channel_info.gpFifoEntries));

    UVM_ASSERT(uvm_channel_is_wlc(wlc));
-    UVM_ASSERT(tag_offset == UVM_MAX_WLC_PUSH_SIZE);
+    UVM_ASSERT(tag_offset == UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT));

    // WLC schedule consists of two parts, the number of entries needs to be even.
    // This also guarantees that the size is 16B aligned
@@ -2692,11 +2874,9 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)

    // Prime the WLC by setting "PUT" two steps ahead. Reuse the current
    // cpu_put value that was used during channel initialization.
-    // Don't update wlc->cpu_put, it will be used to track
-    // submitted pushes as any other channel.
-    do_semaphore_release(&sec2_push,
-                         wlc->channel_info.gpPutGpuVa,
-                         (wlc->cpu_put + 2) % wlc->num_gpfifo_entries);
+    // Don't update wlc->cpu_put, it will be used to track submitted pushes
+    // as any other channel.
+    update_gpput_via_sec2(&sec2_push, wlc, (wlc->cpu_put + 2) % wlc->num_gpfifo_entries);

    status = uvm_push_end_and_wait(&sec2_push);

@@ -3048,9 +3228,7 @@ static void channel_manager_stop_wlc(uvm_channel_manager_t *manager)
        // Every gpfifo entry advances the gpu put of WLC by two so the current
        // value is: (cpu_put * 2) % num_gpfifo_entries and it's ahead of the
        // get pointer by 2.
-        do_semaphore_release(&push,
-                             channel->channel_info.gpPutGpuVa,
-                             (channel->cpu_put * 2 - 2) % channel->num_gpfifo_entries);
+        update_gpput_via_sec2(&push, channel, (channel->cpu_put * 2 - 2) % channel->num_gpfifo_entries);
    }

    status = uvm_push_end_and_wait(&push);
--- a/kernel-open/nvidia-uvm/uvm_conf_computing.c
+++ b/kernel-open/nvidia-uvm/uvm_conf_computing.c
@@ -378,11 +378,12 @@ void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv)
    NV_STATUS status;

    uvm_mutex_lock(&channel->csl.ctx_lock);
-    status = nvUvmInterfaceCslLogDeviceEncryption(&channel->csl.ctx, iv);
+    status = nvUvmInterfaceCslIncrementIv(&channel->csl.ctx, UVM_CSL_OPERATION_DECRYPT, 1, iv);
    uvm_mutex_unlock(&channel->csl.ctx_lock);

-    // nvUvmInterfaceLogDeviceEncryption fails when a 64-bit encryption counter
-    // overflows. This is not supposed to happen on CC.
+    // TODO: Bug 4014720: If nvUvmInterfaceCslIncrementIv returns with
+    // NV_ERR_INSUFFICIENT_RESOURCES then the IV needs to be rotated via
+    // nvUvmInterfaceCslRotateIv.
    UVM_ASSERT(status == NV_OK);
 }

@@ -391,11 +392,12 @@ void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *
    NV_STATUS status;

    uvm_mutex_lock(&channel->csl.ctx_lock);
-    status = nvUvmInterfaceCslAcquireEncryptionIv(&channel->csl.ctx, iv);
+    status = nvUvmInterfaceCslIncrementIv(&channel->csl.ctx, UVM_CSL_OPERATION_ENCRYPT, 1, iv);
    uvm_mutex_unlock(&channel->csl.ctx_lock);

-    // nvUvmInterfaceLogDeviceEncryption fails when a 64-bit encryption counter
-    // overflows. This is not supposed to happen on CC.
+    // TODO: Bug 4014720: If nvUvmInterfaceCslIncrementIv returns with
+    // NV_ERR_INSUFFICIENT_RESOURCES then the IV needs to be rotated via
+    // nvUvmInterfaceCslRotateIv.
    UVM_ASSERT(status == NV_OK);
 }

@@ -439,6 +441,8 @@ NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
                                      (const NvU8 *) src_cipher,
                                      src_iv,
                                      (NvU8 *) dst_plain,
+                                      NULL,
+                                      0,
                                      (const NvU8 *) auth_tag_buffer);
    uvm_mutex_unlock(&channel->csl.ctx_lock);

--- a/kernel-open/nvidia-uvm/uvm_conf_computing.h
+++ b/kernel-open/nvidia-uvm/uvm_conf_computing.h
@@ -42,9 +42,11 @@
 // Use sizeof(UvmCslIv) to refer to the IV size.
 #define UVM_CONF_COMPUTING_IV_ALIGNMENT 16

-// SEC2 decrypt operation buffers are required to be 16-bytes aligned. CE
-// encrypt/decrypt can be unaligned if the buffer lies in a single 32B segment.
-// Otherwise, they need to be 32B aligned.
+// SEC2 decrypt operation buffers are required to be 16-bytes aligned.
+#define UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT 16
+
+// CE encrypt/decrypt can be unaligned if the entire buffer lies in a single
+// 32B segment. Otherwise, it needs to be 32B aligned.
 #define UVM_CONF_COMPUTING_BUF_ALIGNMENT 32

 #define UVM_CONF_COMPUTING_DMA_BUFFER_SIZE UVM_VA_BLOCK_SIZE
--- a/kernel-open/nvidia-uvm/uvm_hmm.c
+++ b/kernel-open/nvidia-uvm/uvm_hmm.c
@@ -2575,7 +2575,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
                continue;
            }

-            if (folio_test_swapcache(page_folio(src_page))) {
+            if (PageSwapCache(src_page)) {
                // TODO: Bug 4050579: Remove this when swap cached pages can be
                // migrated.
                if (service_context) {
--- a/kernel-open/nvidia-uvm/uvm_hopper_sec2.c
+++ b/kernel-open/nvidia-uvm/uvm_hopper_sec2.c
@@ -166,6 +166,7 @@ void uvm_hal_hopper_sec2_decrypt(uvm_push_t *push, NvU64 dst_va, NvU64 src_va, N
    NvU32 *csl_sign_init = push->next;

    // Check that the provided alignment matches HW
+    BUILD_BUG_ON(UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT != (1 << HWSHIFT(CBA2, DECRYPT_COPY_DST_ADDR_LO, DATA)));
    BUILD_BUG_ON(UVM_CONF_COMPUTING_BUF_ALIGNMENT < (1 << HWSHIFT(CBA2, DECRYPT_COPY_DST_ADDR_LO, DATA)));
    BUILD_BUG_ON(UVM_CONF_COMPUTING_BUF_ALIGNMENT % (1 << HWSHIFT(CBA2, DECRYPT_COPY_DST_ADDR_LO, DATA)) != 0);

--- a/kernel-open/nvidia-uvm/uvm_pushbuffer.h
+++ b/kernel-open/nvidia-uvm/uvm_pushbuffer.h
@@ -161,22 +161,22 @@
 // * WFI:                     8B
 // Total:                    64B
 //
-// Push space needed for secure work launch is 224B. The push is constructed
+// Push space needed for secure work launch is 364B. The push is constructed
 // in 'internal_channel_submit_work_indirect' and 'uvm_channel_end_push'
 // * CE decrypt (of indirect PB):                   56B
-// * 2*semaphore release (indirect GPFIFO entry): 2*24B
+// * memset_8 (indirect GPFIFO entry):              44B
 // * semaphore release (indirect GPPUT):            24B
 // * semaphore release (indirect doorbell):         24B
 // Appendix added in 'uvm_channel_end_push':
 // * semaphore release (WLC tracking):             168B
-//      * semaphore increment (memcopy):            24B
+//      * semaphore release (payload):              24B
 //      * notifier memset:                          40B
 //      * payload encryption:                       64B
 //      * notifier memset:                          40B
 // * semaphore increment (LCIC GPPUT):              24B
 // * semaphore release (LCIC doorbell):             24B
-// Total:                                          368B
-#define UVM_MAX_WLC_PUSH_SIZE (368)
+// Total:                                          364B
+#define UVM_MAX_WLC_PUSH_SIZE (364)

 // Push space needed for static LCIC schedule, as initialized in
 // 'setup_lcic_schedule':
@@ -184,7 +184,7 @@
 // * semaphore increment (WLC GPPUT):      24B
 // * semaphore increment (WLC GPPUT):      24B
 // * semaphore increment (LCIC tracking): 160B
-//      * semaphore increment (memcopy):   24B
+//      * semaphore increment (payload):   24B
 //      * notifier memcopy:                36B
 //      * payload encryption:              64B
 //      * notifier memcopy:                36B
--- a/kernel-open/nvidia-uvm/uvm_sec2_test.c
+++ b/kernel-open/nvidia-uvm/uvm_sec2_test.c
@@ -213,6 +213,7 @@ done:
 typedef enum
 {
    MEM_ALLOC_TYPE_SYSMEM_DMA,
+    MEM_ALLOC_TYPE_SYSMEM_PROTECTED,
    MEM_ALLOC_TYPE_VIDMEM_PROTECTED
 } mem_alloc_type_t;

@@ -274,7 +275,11 @@ static NV_STATUS alloc_and_init_mem(uvm_gpu_t *gpu, uvm_mem_t **mem, size_t size
        TEST_NV_CHECK_GOTO(ce_memset_gpu(gpu, *mem, size, 0xdead), err);
    }
    else {
-        TEST_NV_CHECK_RET(uvm_mem_alloc_sysmem_dma(size, gpu, NULL, mem));
+        if (type == MEM_ALLOC_TYPE_SYSMEM_DMA)
+            TEST_NV_CHECK_RET(uvm_mem_alloc_sysmem_dma(size, gpu, NULL, mem));
+        else
+            TEST_NV_CHECK_RET(uvm_mem_alloc_sysmem(size, NULL, mem));
+
        TEST_NV_CHECK_GOTO(uvm_mem_map_cpu_kernel(*mem), err);
        TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(*mem, gpu), err);
        write_range_cpu(*mem, size, 0xdeaddead);
@@ -405,48 +410,6 @@ static void gpu_decrypt(uvm_push_t *push,
    }
 }

-// This test only uses sysmem so that we can use the CPU for encryption and SEC2
-// for decryption, i.e., the test doesn't depend on any other GPU engine for
-// the encryption operation (refer to test_cpu_to_gpu_roundtrip()). This is not
-// how SEC2 is used in the driver. The intended SEC2 usage is to decrypt from
-// unprotected sysmem to protected vidmem, which is tested in
-// test_cpu_to_gpu_roundtrip().
-static NV_STATUS test_cpu_to_gpu_sysmem(uvm_gpu_t *gpu, size_t copy_size, size_t size)
-{
-    NV_STATUS status = NV_OK;
-    uvm_mem_t *src_plain = NULL;
-    uvm_mem_t *cipher = NULL;
-    uvm_mem_t *dst_plain = NULL;
-    uvm_mem_t *auth_tag_mem = NULL;
-    size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
-    uvm_push_t push;
-
-    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &src_plain, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
-    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_plain, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
-    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &cipher, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
-    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &auth_tag_mem, auth_tag_buffer_size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
-
-    write_range_cpu(src_plain, size, uvm_get_stale_thread_id());
-    write_range_cpu(dst_plain, size, 0xA5A5A5A5);
-
-    TEST_NV_CHECK_GOTO(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_SEC2, &push, "enc(cpu)_dec(gpu)"), out);
-
-    cpu_encrypt(push.channel, cipher, src_plain, auth_tag_mem, size, copy_size);
-    gpu_decrypt(&push, dst_plain, cipher, auth_tag_mem, size, copy_size);
-
-    uvm_push_end_and_wait(&push);
-
-    TEST_CHECK_GOTO(mem_match(src_plain, dst_plain), out);
-
-out:
-    uvm_mem_free(auth_tag_mem);
-    uvm_mem_free(cipher);
-    uvm_mem_free(dst_plain);
-    uvm_mem_free(src_plain);
-
-    return status;
-}
-
 // This test depends on the CE for the encryption, so we assume tests from
 // uvm_ce_test.c have successfully passed.
 static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, size_t size)
@@ -461,19 +424,16 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz
    size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
    uvm_push_t push;
    UvmCslIv *decrypt_iv;
-    uvm_tracker_t tracker;

    decrypt_iv = uvm_kvmalloc_zero((size / copy_size) * sizeof(UvmCslIv));
    if (!decrypt_iv)
        return NV_ERR_NO_MEMORY;

-    uvm_tracker_init(&tracker);
-
-    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &src_plain, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
+    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &src_plain, size, MEM_ALLOC_TYPE_SYSMEM_PROTECTED), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &src_cipher, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_cipher, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_plain, size, MEM_ALLOC_TYPE_VIDMEM_PROTECTED), out);
-    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_plain_cpu, size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);
+    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &dst_plain_cpu, size, MEM_ALLOC_TYPE_SYSMEM_PROTECTED), out);
    TEST_NV_CHECK_GOTO(alloc_and_init_mem(gpu, &auth_tag_mem, auth_tag_buffer_size, MEM_ALLOC_TYPE_SYSMEM_DMA), out);

    write_range_cpu(src_plain, size, uvm_get_stale_thread_id());
@@ -483,14 +443,13 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz
    cpu_encrypt(push.channel, src_cipher, src_plain, auth_tag_mem, size, copy_size);
    gpu_decrypt(&push, dst_plain, src_cipher, auth_tag_mem, size, copy_size);

-    uvm_push_end(&push);
-    TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), out);

-    TEST_NV_CHECK_GOTO(uvm_push_begin_acquire(gpu->channel_manager,
-                                              UVM_CHANNEL_TYPE_GPU_TO_CPU,
-                                              &tracker,
-                                              &push,
-                                              "enc(gpu)_dec(cpu)"),
+    // Wait for SEC2 before launching the CE part.
+    // SEC2 is only allowed to release semaphores in unprotected sysmem,
+    // and CE can only acquire semaphores in protected vidmem.
+    TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);
+
+    TEST_NV_CHECK_GOTO(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "enc(gpu)_dec(cpu)"),
                       out);

    gpu_encrypt(&push, dst_cipher, dst_plain, decrypt_iv, auth_tag_mem, size, copy_size);
@@ -521,8 +480,6 @@ out:

    uvm_kvfree(decrypt_iv);

-    uvm_tracker_deinit(&tracker);
-
    return status;
 }

@@ -545,7 +502,6 @@ static NV_STATUS test_encryption_decryption(uvm_gpu_t *gpu)

        UVM_ASSERT(size % copy_sizes[i] == 0);

-        TEST_NV_CHECK_RET(test_cpu_to_gpu_sysmem(gpu, copy_sizes[i], size));
        TEST_NV_CHECK_RET(test_cpu_to_gpu_roundtrip(gpu, copy_sizes[i], size));
    }

--- a/kernel-open/nvidia-uvm/uvm_tracker_test.c
+++ b/kernel-open/nvidia-uvm/uvm_tracker_test.c
@@ -69,6 +69,14 @@ static NV_STATUS test_tracker_completion(uvm_va_space_t *va_space)
    gpu = uvm_va_space_find_first_gpu(va_space);
    TEST_CHECK_RET(gpu != NULL);

+    // TODO: Bug 4008734: [UVM][HCC] Extend secure tracking semaphore mechanism
+    //                     to all semaphores
+    // This test allocates a semaphore in vidmem and releases it from the CPU.
+    // SEC2 channels cannot target semaphores in vidmem. Moreover, the CPU
+    // cannot directly release values to vidmem for CE channels.
+    if (uvm_conf_computing_mode_enabled(gpu))
+        return NV_OK;
+
    TEST_NV_CHECK_RET(uvm_gpu_semaphore_alloc(gpu->semaphore_pool, &sema));

    uvm_tracker_init(&tracker);
--- a/kernel-open/nvidia-uvm/uvm_va_block.c
+++ b/kernel-open/nvidia-uvm/uvm_va_block.c
@@ -7189,6 +7189,7 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
 }

 static void map_get_allowed_destinations(uvm_va_block_t *block,
+                                         uvm_va_block_context_t *va_block_context,
                                         const uvm_va_policy_t *policy,
                                         uvm_processor_id_t id,
                                         uvm_processor_mask_t *allowed_mask)
@@ -7200,7 +7201,10 @@ static void map_get_allowed_destinations(uvm_va_block_t *block,
        uvm_processor_mask_zero(allowed_mask);
        uvm_processor_mask_set(allowed_mask, policy->preferred_location);
    }
-    else if ((uvm_va_policy_is_read_duplicate(policy, va_space) || uvm_id_equal(policy->preferred_location, id)) &&
+    else if ((uvm_va_policy_is_read_duplicate(policy, va_space) ||
+              (uvm_id_equal(policy->preferred_location, id) &&
+               !is_uvm_fault_force_sysmem_set() &&
+               !uvm_hmm_must_use_sysmem(block, va_block_context))) &&
             uvm_va_space_processor_has_memory(va_space, id)) {
        // When operating under read-duplication we should only map the local
        // processor to cause fault-and-duplicate of remote pages.
@@ -7285,7 +7289,7 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,

    // Map per resident location so we can more easily detect physically-
    // contiguous mappings.
-    map_get_allowed_destinations(va_block, va_block_context->policy, id, &allowed_destinations);
+    map_get_allowed_destinations(va_block, va_block_context, va_block_context->policy, id, &allowed_destinations);

    for_each_closest_id(resident_id, &allowed_destinations, id, va_space) {
        if (UVM_ID_IS_CPU(id)) {
--- a/kernel-open/nvidia-uvm/uvm_va_space.c
+++ b/kernel-open/nvidia-uvm/uvm_va_space.c
@@ -418,15 +418,6 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space)
    uvm_global_processor_mask_t retained_gpus;
    LIST_HEAD(deferred_free_list);

-    // Normally we'd expect this to happen as part of uvm_mm_release()
-    // but if userspace never initialized uvm_mm_fd that won't happen.
-    // We don't have to take the va_space_mm spinlock and update state
-    // here because we know no other thread can be in or subsequently
-    // call uvm_api_mm_initialize successfully because the UVM
-    // file-descriptor has been released.
-    if (va_space->va_space_mm.state == UVM_VA_SPACE_MM_STATE_UNINITIALIZED)
-        uvm_va_space_mm_unregister(va_space);
-
    // Remove the VA space from the global list before we start tearing things
    // down so other threads can't see the VA space in a partially-valid state.
    uvm_mutex_lock(&g_uvm_global.va_spaces.lock);
@@ -532,7 +523,14 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space)

    uvm_deferred_free_object_list(&deferred_free_list);

-    // MM FD teardown should already have destroyed va_space_mm
+    // Normally we'd expect this to happen as part of uvm_mm_release()
+    // but if userspace never initialized uvm_mm_fd that won't happen.
+    // We don't have to take the va_space_mm spinlock and update state
+    // here because we know no other thread can be in or subsequently
+    // call uvm_api_mm_initialize successfully because the UVM
+    // file-descriptor has been released.
+    if (va_space->va_space_mm.state == UVM_VA_SPACE_MM_STATE_UNINITIALIZED)
+        uvm_va_space_mm_unregister(va_space);
    UVM_ASSERT(!uvm_va_space_mm_alive(&va_space->va_space_mm));

    uvm_mutex_lock(&g_uvm_global.global_lock);
--- a/kernel-open/nvidia/nv.c
+++ b/kernel-open/nvidia/nv.c
@@ -1396,6 +1396,8 @@ static int nv_start_device(nv_state_t *nv, nvidia_stack_t *sp)

    nv->flags |= NV_FLAG_OPEN;

+    rm_request_dnotifier_state(sp, nv);
+
    /*
     * Now that RM init is done, allow dynamic power to control the GPU in FINE
     * mode, if enabled.  (If the mode is COARSE, this unref will do nothing
--- a/kernel-open/nvidia/nv_gpu_ops.h
+++ b/kernel-open/nvidia/nv_gpu_ops.h
@@ -290,10 +290,6 @@ NV_STATUS nvGpuOpsFlushReplayableFaultBuffer(struct gpuDevice *device);
 NV_STATUS nvGpuOpsCcslContextInit(struct ccslContext_t **ctx,
                                  gpuChannelHandle channel);
 NV_STATUS nvGpuOpsCcslContextClear(struct ccslContext_t *ctx);
-NV_STATUS nvGpuOpsCcslLogDeviceEncryption(struct ccslContext_t *ctx,
-                                          NvU8 *decryptIv);
-NV_STATUS nvGpuOpsCcslAcquireEncryptionIv(struct ccslContext_t *ctx,
-                                          NvU8 *encryptIv);
 NV_STATUS nvGpuOpsCcslRotateIv(struct ccslContext_t *ctx,
                               NvU8 direction);
 NV_STATUS nvGpuOpsCcslEncrypt(struct ccslContext_t *ctx,
@@ -312,6 +308,8 @@ NV_STATUS nvGpuOpsCcslDecrypt(struct ccslContext_t *ctx,
                              NvU8 const *inputBuffer,
                              NvU8 const *decryptIv,
                              NvU8 *outputBuffer,
+                              NvU8 const *addAuthData,
+                              NvU32 addAuthDataSize,
                              NvU8 const *authTagBuffer);
 NV_STATUS nvGpuOpsCcslSign(struct ccslContext_t *ctx,
                           NvU32 bufferSize,
@@ -320,5 +318,9 @@ NV_STATUS nvGpuOpsCcslSign(struct ccslContext_t *ctx,
 NV_STATUS nvGpuOpsQueryMessagePool(struct ccslContext_t *ctx,
                                   NvU8 direction,
                                   NvU64 *messageNum);
+NV_STATUS nvGpuOpsIncrementIv(struct ccslContext_t *ctx,
+                              NvU8 direction,
+                              NvU64 increment,
+                              NvU8 *iv);

 #endif /* _NV_GPU_OPS_H_*/
--- a/kernel-open/nvidia/nv_uvm_interface.c
+++ b/kernel-open/nvidia/nv_uvm_interface.c
@@ -1504,44 +1504,18 @@ void nvUvmInterfaceDeinitCslContext(UvmCslContext *uvmCslContext)
 }
 EXPORT_SYMBOL(nvUvmInterfaceDeinitCslContext);

-NV_STATUS nvUvmInterfaceCslLogDeviceEncryption(UvmCslContext *uvmCslContext,
-                                               UvmCslIv *decryptIv)
-{
-    NV_STATUS status;
-    nvidia_stack_t *sp = uvmCslContext->nvidia_stack;
-
-    status = rm_gpu_ops_ccsl_log_device_encryption(sp, uvmCslContext->ctx, (NvU8 *)decryptIv);
-
-    return status;
-}
-EXPORT_SYMBOL(nvUvmInterfaceCslLogDeviceEncryption);
-
 NV_STATUS nvUvmInterfaceCslRotateIv(UvmCslContext *uvmCslContext,
-                                    UvmCslDirection direction)
+                                    UvmCslOperation operation)
 {
    NV_STATUS status;
    nvidia_stack_t *sp = uvmCslContext->nvidia_stack;

-    status = rm_gpu_ops_ccsl_rotate_iv(sp, uvmCslContext->ctx, direction);
+    status = rm_gpu_ops_ccsl_rotate_iv(sp, uvmCslContext->ctx, operation);

    return status;
 }
 EXPORT_SYMBOL(nvUvmInterfaceCslRotateIv);

-NV_STATUS nvUvmInterfaceCslAcquireEncryptionIv(UvmCslContext *uvmCslContext,
-                                               UvmCslIv *encryptIv)
-{
-    NV_STATUS status;
-    nvidia_stack_t *sp = uvmCslContext->nvidia_stack;
-
-    BUILD_BUG_ON(NV_OFFSETOF(UvmCslIv, fresh) != sizeof(encryptIv->iv));
-
-    status = rm_gpu_ops_ccsl_acquire_encryption_iv(sp, uvmCslContext->ctx, (NvU8*)encryptIv);
-
-    return status;
-}
-EXPORT_SYMBOL(nvUvmInterfaceCslAcquireEncryptionIv);
-
 NV_STATUS nvUvmInterfaceCslEncrypt(UvmCslContext *uvmCslContext,
                                   NvU32 bufferSize,
                                   NvU8 const *inputBuffer,
@@ -1566,6 +1540,8 @@ NV_STATUS nvUvmInterfaceCslDecrypt(UvmCslContext *uvmCslContext,
                                   NvU8 const *inputBuffer,
                                   UvmCslIv const *decryptIv,
                                   NvU8 *outputBuffer,
+                                   NvU8 const *addAuthData,
+                                   NvU32 addAuthDataSize,
                                   NvU8 const *authTagBuffer)
 {
    NV_STATUS status;
@@ -1577,6 +1553,8 @@ NV_STATUS nvUvmInterfaceCslDecrypt(UvmCslContext *uvmCslContext,
                                     inputBuffer,
                                     (NvU8 *)decryptIv,
                                     outputBuffer,
+                                     addAuthData,
+                                     addAuthDataSize,
                                     authTagBuffer);

    return status;
@@ -1598,18 +1576,32 @@ NV_STATUS nvUvmInterfaceCslSign(UvmCslContext *uvmCslContext,
 EXPORT_SYMBOL(nvUvmInterfaceCslSign);

 NV_STATUS nvUvmInterfaceCslQueryMessagePool(UvmCslContext *uvmCslContext,
-                                            UvmCslDirection direction,
+                                            UvmCslOperation operation,
                                            NvU64 *messageNum)
 {
    NV_STATUS status;
    nvidia_stack_t *sp = uvmCslContext->nvidia_stack;

-    status = rm_gpu_ops_ccsl_query_message_pool(sp, uvmCslContext->ctx, direction, messageNum);
+    status = rm_gpu_ops_ccsl_query_message_pool(sp, uvmCslContext->ctx, operation, messageNum);

    return status;
 }
 EXPORT_SYMBOL(nvUvmInterfaceCslQueryMessagePool);

+// Exported wrapper around rm_gpu_ops_ccsl_increment_iv: hands it the context's
+// nvidia_stack and ctx, plus operation, increment and iv (cast to NvU8 *), and
+// returns its status unchanged. uvmCslContext is dereferenced unconditionally.
+NV_STATUS nvUvmInterfaceCslIncrementIv(UvmCslContext *uvmCslContext,
+                                       UvmCslOperation operation,
+                                       NvU64 increment,
+                                       UvmCslIv *iv)
+{
+    nvidia_stack_t *rmStack = uvmCslContext->nvidia_stack;
+
+    return rm_gpu_ops_ccsl_increment_iv(rmStack, uvmCslContext->ctx, operation, increment, (NvU8 *)iv);
+}
+EXPORT_SYMBOL(nvUvmInterfaceCslIncrementIv);
+
 #else // NV_UVM_ENABLE

 NV_STATUS nv_uvm_suspend(void)