535.43.02

Andy Ritger
2023-05-30 10:11:36 -07:00
parent 6dd092ddb7
commit eb5c7665a1
1403 changed files with 295367 additions and 86235 deletions

kernel-open/nvidia-uvm/uvm_hopper_ce.c

@@ -24,6 +24,7 @@
#include "uvm_hal.h"
#include "uvm_push.h"
#include "uvm_mem.h"
#include "uvm_conf_computing.h"
#include "clc8b5.h"
static NvU32 ce_aperture(uvm_aperture_t aperture)
@@ -97,7 +98,8 @@ void uvm_hal_hopper_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 p
NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
-HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_ONE_WORD_SEMAPHORE) |
+HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_PAYLOAD_SIZE, ONE_WORD) |
+HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_SEMAPHORE_NO_TIMESTAMP) |
launch_dma_plc_mode);
}
@@ -114,7 +116,8 @@ void uvm_hal_hopper_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, N
NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
-HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_ONE_WORD_SEMAPHORE) |
+HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_PAYLOAD_SIZE, ONE_WORD) |
+HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_SEMAPHORE_NO_TIMESTAMP) |
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_REDUCTION, INC) |
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_SIGN, UNSIGNED) |
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_ENABLE, TRUE) |
@@ -135,7 +138,8 @@ void uvm_hal_hopper_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va)
NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
-HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_FOUR_WORD_SEMAPHORE) |
+HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_PAYLOAD_SIZE, ONE_WORD) |
+HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_SEMAPHORE_WITH_TIMESTAMP) |
launch_dma_plc_mode);
}
@@ -148,12 +152,46 @@ static NvU32 hopper_memset_push_phys_mode(uvm_push_t *push, uvm_gpu_address_t ds
return HWCONST(C8B5, LAUNCH_DMA, DST_TYPE, PHYSICAL);
}
-static bool hopper_scrub_enable(uvm_gpu_address_t dst, size_t size)
+static bool va_is_flat_vidmem(uvm_gpu_t *gpu, NvU64 va)
{
return !dst.is_virtual &&
dst.aperture == UVM_APERTURE_VID &&
IS_ALIGNED(dst.address, UVM_PAGE_SIZE_4K) &&
IS_ALIGNED(size, UVM_PAGE_SIZE_4K);
return (uvm_mmu_gpu_needs_static_vidmem_mapping(gpu) || uvm_mmu_gpu_needs_dynamic_vidmem_mapping(gpu)) &&
va >= gpu->parent->flat_vidmem_va_base &&
va < gpu->parent->flat_vidmem_va_base + UVM_GPU_MAX_PHYS_MEM;
}
// Return whether a memset should use the fast scrubber. If so, convert dst to
// the address needed by the fast scrubber.
static bool hopper_scrub_enable(uvm_gpu_t *gpu, uvm_gpu_address_t *dst, size_t size)
{
if (!IS_ALIGNED(dst->address, UVM_PAGE_SIZE_4K) || !IS_ALIGNED(size, UVM_PAGE_SIZE_4K))
return false;
// When CE physical writes are disallowed, higher layers will convert
// physical memsets to virtual using the flat mapping. Those layers are
// unaware of the fast scrubber, which is safe to use specifically when CE
// physical access is disallowed. Detect such memsets within the flat vidmem
// region and convert them back to physical, since the fast scrubber only
// works with physical addressing.
if (dst->is_virtual && !gpu->parent->ce_phys_vidmem_write_supported && va_is_flat_vidmem(gpu, dst->address)) {
*dst = uvm_gpu_address_physical(UVM_APERTURE_VID, dst->address - gpu->parent->flat_vidmem_va_base);
return true;
}
return !dst->is_virtual && dst->aperture == UVM_APERTURE_VID;
}
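// Illustrative sketch (hypothetical 4K-aligned offset, not part of the
// change): when CE physical vidmem writes are disallowed, a virtual scrub
// inside the flat vidmem region is converted back to a physical address:
//
//     uvm_gpu_address_t dst = uvm_gpu_address_virtual(gpu->parent->flat_vidmem_va_base + offset);
//     if (hopper_scrub_enable(gpu, &dst, UVM_PAGE_SIZE_4K)) {
//         // dst is now {UVM_APERTURE_VID, offset} and the fast scrubber applies
//     }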
static NvU32 hopper_memset_copy_type(uvm_push_t *push, uvm_gpu_address_t dst)
{
if (uvm_conf_computing_mode_enabled(uvm_push_get_gpu(push)) && dst.is_unprotected)
return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, NONPROT2NONPROT);
return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, DEFAULT);
}
NvU32 uvm_hal_hopper_ce_memcopy_copy_type(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
{
if (uvm_conf_computing_mode_enabled(uvm_push_get_gpu(push)) && dst.is_unprotected && src.is_unprotected)
return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, NONPROT2NONPROT);
return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, DEFAULT);
}
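// Both helpers encode the same rule: under Confidential Computing, a transfer
// in which every accessed buffer is unprotected must use COPY_TYPE
// NONPROT2NONPROT, while protected (default) transfers use COPY_TYPE DEFAULT.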
static void hopper_memset_common(uvm_push_t *push,
@@ -172,8 +210,10 @@ static void hopper_memset_common(uvm_push_t *push,
NvU32 launch_dma_remap_enable;
NvU32 launch_dma_scrub_enable;
NvU32 flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);
+NvU32 copy_type_value = hopper_memset_copy_type(push, dst);
+bool is_scrub = hopper_scrub_enable(gpu, &dst, num_elements * memset_element_size);
-UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_is_valid(push, dst, memset_element_size),
+UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_is_valid(push, dst, num_elements, memset_element_size),
"Memset validation failed in channel %s, GPU %s",
push->channel->name,
uvm_gpu_name(gpu));
@@ -186,7 +226,7 @@ static void hopper_memset_common(uvm_push_t *push,
else
pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);
if (memset_element_size == 8 && hopper_scrub_enable(dst, num_elements * memset_element_size)) {
if (memset_element_size == 8 && is_scrub) {
launch_dma_remap_enable = HWCONST(C8B5, LAUNCH_DMA, REMAP_ENABLE, FALSE);
launch_dma_scrub_enable = HWCONST(C8B5, LAUNCH_DMA, MEMORY_SCRUB_ENABLE, TRUE);
@@ -223,6 +263,7 @@ static void hopper_memset_common(uvm_push_t *push,
launch_dma_scrub_enable |
launch_dma_dst_type |
launch_dma_plc_mode |
+copy_type_value |
pipelined_value);
dst.address += memset_this_time * memset_element_size;
@@ -250,7 +291,7 @@ void uvm_hal_hopper_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 v
void uvm_hal_hopper_ce_memset_1(uvm_push_t *push, uvm_gpu_address_t dst, NvU8 value, size_t size)
{
-if (hopper_scrub_enable(dst, size)) {
+if (hopper_scrub_enable(uvm_push_get_gpu(push), &dst, size)) {
NvU64 value64 = value;
value64 |= value64 << 8;
@@ -274,7 +315,7 @@ void uvm_hal_hopper_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32 v
{
UVM_ASSERT_MSG(size % 4 == 0, "size: %zd\n", size);
-if (hopper_scrub_enable(dst, size)) {
+if (hopper_scrub_enable(uvm_push_get_gpu(push), &dst, size)) {
NvU64 value64 = value;
value64 |= value64 << 32;
@@ -294,15 +335,235 @@ void uvm_hal_hopper_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32 v
hopper_memset_common(push, dst, size, 4);
}
-bool uvm_hal_hopper_ce_memset_is_valid(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size)
+bool uvm_hal_hopper_ce_memset_is_valid(uvm_push_t *push,
+uvm_gpu_address_t dst,
+size_t num_elements,
+size_t element_size)
{
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
// In HCC, if a memset uses physical addressing for the destination, then
// it must write to (protected) vidmem. If the memset uses virtual
// addressing and the backing storage is not vidmem, the access is only
// legal if the copy type is NONPROT2NONPROT and the destination is
// unprotected sysmem, but the validation does not detect that case.
if (uvm_conf_computing_mode_is_hcc(gpu) && !dst.is_virtual && dst.aperture != UVM_APERTURE_VID)
return false;
if (!gpu->parent->ce_phys_vidmem_write_supported) {
size_t size = num_elements * element_size;
uvm_gpu_address_t temp = dst;
// Physical vidmem writes are disallowed, unless using the scrubber
if (!dst.is_virtual && dst.aperture == UVM_APERTURE_VID && !hopper_scrub_enable(gpu, &temp, size)) {
UVM_ERR_PRINT("Destination address of vidmem memset must be virtual, not physical: {%s, 0x%llx} size %zu\n",
uvm_gpu_address_aperture_string(dst),
dst.address,
size);
return false;
}
}
return true;
}
bool uvm_hal_hopper_ce_memcopy_is_valid(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
{
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
if (uvm_conf_computing_mode_is_hcc(gpu)) {
// In HCC, if a memcopy uses physical addressing for either the
// destination or the source, then the corresponding aperture must be
// vidmem. If virtual addressing is used and the backing storage is
// sysmem, the access is only legal if the copy type is NONPROT2NONPROT,
// i.e., both the copy source and destination are unprotected sysmem,
// but the validation does not detect that case.
if (!src.is_virtual && (src.aperture != UVM_APERTURE_VID))
return false;
if (!dst.is_virtual && (dst.aperture != UVM_APERTURE_VID))
return false;
if (dst.is_unprotected != src.is_unprotected)
return false;
}
if (!gpu->parent->ce_phys_vidmem_write_supported && !dst.is_virtual && dst.aperture == UVM_APERTURE_VID) {
UVM_ERR_PRINT("Destination address of vidmem memcopy must be virtual, not physical: {%s, 0x%llx}\n",
uvm_gpu_address_aperture_string(dst),
dst.address);
return false;
}
return true;
}
// Specialized version of uvm_hal_volta_ce_memcopy used for encryption and
// decryption. Pre-Hopper functionality, such as validation or address patching,
// has been removed.
static void encrypt_or_decrypt(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, NvU32 size)
{
NvU32 pipelined_value;
NvU32 launch_dma_src_dst_type;
NvU32 launch_dma_plc_mode;
NvU32 flush_value;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
// HW allows unaligned operations only if the entire buffer is in one 32B
// sector. Operations on buffers larger than 32B have to be aligned.
if (size > UVM_CONF_COMPUTING_BUF_ALIGNMENT) {
UVM_ASSERT(IS_ALIGNED(src.address, UVM_CONF_COMPUTING_BUF_ALIGNMENT));
UVM_ASSERT(IS_ALIGNED(dst.address, UVM_CONF_COMPUTING_BUF_ALIGNMENT));
}
else {
UVM_ASSERT((dst.address >> UVM_CONF_COMPUTING_BUF_ALIGNMENT) ==
((dst.address + size - 1) >> UVM_CONF_COMPUTING_BUF_ALIGNMENT));
UVM_ASSERT((src.address >> UVM_CONF_COMPUTING_BUF_ALIGNMENT) ==
((src.address + size - 1) >> UVM_CONF_COMPUTING_BUF_ALIGNMENT));
}
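// Note: the same-sector comparison above shifts by
// UVM_CONF_COMPUTING_BUF_ALIGNMENT itself; if that constant is the 32-byte
// alignment value used in IS_ALIGNED() above, a 32B sector index would
// normally be obtained by dividing by the alignment (shifting by its log2).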
launch_dma_src_dst_type = gpu->parent->ce_hal->phys_mode(push, dst, src);
launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
else
pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);
flush_value = hopper_get_flush_value(push);
gpu->parent->ce_hal->offset_in_out(push, src.address, dst.address);
NV_PUSH_1U(C8B5, LINE_LENGTH_IN, size);
NV_PUSH_1U(C8B5, LAUNCH_DMA, HWCONST(C8B5, LAUNCH_DMA, SRC_MEMORY_LAYOUT, PITCH) |
HWCONST(C8B5, LAUNCH_DMA, DST_MEMORY_LAYOUT, PITCH) |
HWCONST(C8B5, LAUNCH_DMA, MULTI_LINE_ENABLE, FALSE) |
HWCONST(C8B5, LAUNCH_DMA, REMAP_ENABLE, FALSE) |
HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, SECURE) |
flush_value |
launch_dma_src_dst_type |
launch_dma_plc_mode |
pipelined_value);
}
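// encrypt_or_decrypt() emits only the common tail of a secure transfer: a
// single-line pitch copy launched with COPY_TYPE SECURE. Callers must first
// select the operation via SET_SECURE_COPY_MODE and program the
// operation-specific addresses (authentication tag, plus IV for encryption),
// as uvm_hal_hopper_ce_encrypt() and uvm_hal_hopper_ce_decrypt() do below.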
// The GPU CE encrypt operation requires clients to pass a valid
// address to which the IV used by the encryption will be written. But this
// requirement is unnecessary, because UVM should instead rely on the CSL
// nvUvmInterfaceCslLogDeviceEncryption API to independently track
// the expected IV.
//
// To satisfy the HW requirement, the same unprotected sysmem address is
// passed to all GPU-side encryptions. This dummy buffer is allocated at
// GPU initialization time.
static NvU64 encrypt_iv_address(uvm_push_t *push, uvm_gpu_address_t dst)
{
NvU64 iv_address;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
// Match addressing mode of destination and IV
if (dst.is_virtual) {
iv_address = uvm_rm_mem_get_gpu_va(gpu->conf_computing.iv_rm_mem, gpu, false).address;
}
else {
iv_address = uvm_mem_gpu_physical(gpu->conf_computing.iv_mem,
gpu,
0,
gpu->conf_computing.iv_mem->size).address;
}
UVM_ASSERT(IS_ALIGNED(iv_address, UVM_CONF_COMPUTING_IV_ALIGNMENT));
return iv_address;
}
// TODO: Bug 3842953: adapt CE encrypt/decrypt for p2p encrypted transfers
void uvm_hal_hopper_ce_encrypt(uvm_push_t *push,
uvm_gpu_address_t dst,
uvm_gpu_address_t src,
NvU32 size,
uvm_gpu_address_t auth_tag)
{
NvU32 auth_tag_address_hi32, auth_tag_address_lo32;
NvU64 iv_address;
NvU32 iv_address_hi32, iv_address_lo32;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
UVM_ASSERT(uvm_conf_computing_mode_is_hcc(gpu));
UVM_ASSERT(uvm_push_is_fake(push) || uvm_channel_is_secure(push->channel));
UVM_ASSERT(IS_ALIGNED(auth_tag.address, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT));
if (!src.is_virtual)
UVM_ASSERT(src.aperture == UVM_APERTURE_VID);
// The addressing mode (and aperture, if applicable) of the destination
// pointer determines the addressing mode and aperture used by the
// encryption to reference the other two addresses written by it:
// authentication tag, and IV. If the client passes a sysmem physical
// address as destination, then the authentication tag must also be a sysmem
// physical address.
UVM_ASSERT(dst.is_virtual == auth_tag.is_virtual);
if (!dst.is_virtual) {
UVM_ASSERT(dst.aperture == UVM_APERTURE_SYS);
UVM_ASSERT(auth_tag.aperture == UVM_APERTURE_SYS);
}
NV_PUSH_1U(C8B5, SET_SECURE_COPY_MODE, HWCONST(C8B5, SET_SECURE_COPY_MODE, MODE, ENCRYPT));
auth_tag_address_hi32 = HWVALUE(C8B5, SET_ENCRYPT_AUTH_TAG_ADDR_UPPER, UPPER, NvU64_HI32(auth_tag.address));
auth_tag_address_lo32 = HWVALUE(C8B5, SET_ENCRYPT_AUTH_TAG_ADDR_LOWER, LOWER, NvU64_LO32(auth_tag.address));
iv_address = encrypt_iv_address(push, dst);
iv_address_hi32 = HWVALUE(C8B5, SET_ENCRYPT_IV_ADDR_UPPER, UPPER, NvU64_HI32(iv_address));
iv_address_lo32 = HWVALUE(C8B5, SET_ENCRYPT_IV_ADDR_LOWER, LOWER, NvU64_LO32(iv_address));
NV_PUSH_4U(C8B5, SET_ENCRYPT_AUTH_TAG_ADDR_UPPER, auth_tag_address_hi32,
SET_ENCRYPT_AUTH_TAG_ADDR_LOWER, auth_tag_address_lo32,
SET_ENCRYPT_IV_ADDR_UPPER, iv_address_hi32,
SET_ENCRYPT_IV_ADDR_LOWER, iv_address_lo32);
encrypt_or_decrypt(push, dst, src, size);
}
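// Method sequence pushed above for one encryption (offsets come from the CE
// HAL's offset_in_out):
//
//     SET_SECURE_COPY_MODE                   (MODE, ENCRYPT)
//     SET_ENCRYPT_AUTH_TAG_ADDR_UPPER/LOWER  (authentication tag output)
//     SET_ENCRYPT_IV_ADDR_UPPER/LOWER        (dummy IV output)
//     OFFSET_IN/OUT, LINE_LENGTH_IN, LAUNCH_DMA (COPY_TYPE, SECURE)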
// TODO: Bug 3842953: adapt CE encrypt/decrypt for p2p encrypted transfers
void uvm_hal_hopper_ce_decrypt(uvm_push_t *push,
uvm_gpu_address_t dst,
uvm_gpu_address_t src,
NvU32 size,
uvm_gpu_address_t auth_tag)
{
NvU32 auth_tag_address_hi32, auth_tag_address_lo32;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
UVM_ASSERT(uvm_conf_computing_mode_is_hcc(gpu));
UVM_ASSERT(!push->channel || uvm_channel_is_secure(push->channel));
UVM_ASSERT(IS_ALIGNED(auth_tag.address, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT));
// The addressing mode (and aperture, if applicable) of the source and
// authentication pointers should match. But unlike in the encryption case,
// clients are not forced to pass a valid IV address.
UVM_ASSERT(src.is_virtual == auth_tag.is_virtual);
if (!src.is_virtual) {
UVM_ASSERT(src.aperture == UVM_APERTURE_SYS);
UVM_ASSERT(auth_tag.aperture == UVM_APERTURE_SYS);
}
if (!dst.is_virtual)
UVM_ASSERT(dst.aperture == UVM_APERTURE_VID);
NV_PUSH_1U(C8B5, SET_SECURE_COPY_MODE, HWCONST(C8B5, SET_SECURE_COPY_MODE, MODE, DECRYPT));
auth_tag_address_hi32 = HWVALUE(C8B5, SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_UPPER, UPPER, NvU64_HI32(auth_tag.address));
auth_tag_address_lo32 = HWVALUE(C8B5, SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_LOWER, LOWER, NvU64_LO32(auth_tag.address));
NV_PUSH_2U(C8B5, SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_UPPER, auth_tag_address_hi32,
SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_LOWER, auth_tag_address_lo32);
encrypt_or_decrypt(push, dst, src, size);
}
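// Unlike encryption, decryption does not output an IV or a tag: the
// SET_DECRYPT_AUTH_TAG_COMPARE_ADDR methods point HW at the expected
// authentication tag, to be checked against the tag computed over the
// transferred data.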