570.86.15

Bernhard Stoeckner
2025-01-27 19:36:56 +01:00
parent 9d0b0414a5
commit 54d69484da
1166 changed files with 318863 additions and 182687 deletions


@@ -97,6 +97,11 @@ struct uvm_service_block_context_struct
// been serviced
uvm_processor_mask_t resident_processors;
// A mask of GPUs that need to be checked for NVLINK errors before the
// handler returns, but after the VA space lock has been unlocked
// to avoid RM/UVM VA space lock deadlocks.
uvm_processor_mask_t gpus_to_check_for_nvlink_errors;
// VA block region that contains all the pages affected by the operation
uvm_va_block_region_t region;
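The gpus_to_check_for_nvlink_errors mask above enables a deferred-check pattern: GPUs are recorded while the VA space lock is held, and the NVLINK error check runs only after the lock is dropped. A minimal sketch of that pattern, assuming a for_each_gpu_in_mask-style iterator and a hypothetical service_faults_locked() helper (both names illustrative, not the exact UVM API):

// Sketch only: record GPUs during servicing, check them after unlocking the
// VA space so that calling into RM cannot deadlock against the VA space lock.
static NV_STATUS service_then_check_nvlink(uvm_va_space_t *va_space,
                                           uvm_service_block_context_t *ctx)
{
    uvm_gpu_t *gpu;
    NV_STATUS status;

    uvm_va_space_down_read(va_space);
    status = service_faults_locked(va_space, ctx); // hypothetical helper
    uvm_va_space_up_read(va_space);

    // Safe to call into RM now that the VA space lock is not held.
    for_each_gpu_in_mask(gpu, &ctx->gpus_to_check_for_nvlink_errors) { // illustrative macro
        NV_STATUS check_status = uvm_gpu_check_nvlink_error(gpu);
        if (status == NV_OK)
            status = check_status;
    }

    return status;
}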
@@ -192,6 +197,10 @@ typedef struct
{
struct
{
// Mask of prefetch faulted pages in a UVM_VA_BLOCK_SIZE aligned region
// of a SAM VMA. Used for batching ATS faults in a vma.
uvm_page_mask_t prefetch_only_fault_mask;
// Mask of read faulted pages in a UVM_VA_BLOCK_SIZE aligned region
// of a SAM VMA. Used for batching ATS faults in a vma.
uvm_page_mask_t read_fault_mask;
@@ -202,7 +211,7 @@ typedef struct
// Mask of all faulted pages in a UVM_VA_BLOCK_SIZE aligned region
// of a SAM VMA. This is a logical or of read_fault_mask and
-// write_mask.
+// write_mask and prefetch_only_fault_mask.
uvm_page_mask_t accessed_mask;
// Mask of successfully serviced pages in a UVM_VA_BLOCK_SIZE
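Together these masks batch ATS faults within a VMA: each fault lands in the prefetch-only, read, or write mask, and accessed_mask is their union. A hedged sketch, assuming uvm_page_mask_set()/uvm_page_mask_or() bitmap helpers and a write_fault_mask sibling of read_fault_mask (the faults sub-struct name is illustrative):

// Sketch: classify one fault, then form accessed_mask as the logical OR.
if (access_type == UVM_FAULT_ACCESS_TYPE_PREFETCH)
    uvm_page_mask_set(&ats_context->faults.prefetch_only_fault_mask, page_index);
else if (access_type == UVM_FAULT_ACCESS_TYPE_READ)
    uvm_page_mask_set(&ats_context->faults.read_fault_mask, page_index);
else
    uvm_page_mask_set(&ats_context->faults.write_fault_mask, page_index);

uvm_page_mask_or(&ats_context->faults.accessed_mask,
                 &ats_context->faults.read_fault_mask,
                 &ats_context->faults.write_fault_mask);
uvm_page_mask_or(&ats_context->faults.accessed_mask,
                 &ats_context->faults.accessed_mask,
                 &ats_context->faults.prefetch_only_fault_mask);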
@@ -269,7 +278,6 @@ typedef struct
// Prefetch temporary state.
uvm_perf_prefetch_bitmap_tree_t bitmap_tree;
} prefetch_state;
} uvm_ats_fault_context_t;
struct uvm_fault_service_batch_context_struct
@@ -701,7 +709,7 @@ struct uvm_gpu_struct
// True if the platform supports HW coherence and the GPU's memory
// is exposed as a NUMA node to the kernel.
bool enabled;
-unsigned int node_id;
+int node_id;
} numa;
// Physical address of the start of statically mapped fb memory in BAR1
@@ -871,6 +879,38 @@ struct uvm_gpu_struct
NvBool *error_notifier;
} ecc;
// NVLINK STO recovery handling
// In order to trap STO errors as soon as possible, the driver has the hw
// interrupt register mapped directly. If an STO interrupt is ever noticed
// to be pending, then the UVM driver needs to:
//
// 1) ask RM to service interrupts, and then
// 2) inspect the NVLINK error notifier state.
//
// Notably, checking for channel errors is not enough, because STO errors
// can be pending even after a channel has become idle.
//
// See more details in uvm_gpu_check_nvlink_error().
struct
{
// Does the GPU have NVLINK STO recovery enabled?
bool enabled;
// Artificially injected error for testing
atomic_t injected_error;
// Direct mapping of the 32-bit part of the hw interrupt tree that has
// the NVLINK error bits.
volatile NvU32 *hw_interrupt_tree_location;
// Mask to get the NVLINK error interrupt bits from the 32-bits above.
NvU32 mask;
// Set to true by RM when a fatal NVLINK error is encountered (requires
// asking RM to service pending interrupts to be current).
NvBool *error_notifier;
} nvlink_status;
struct
{
NvU32 swizz_id;
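The nvlink_status fields above make a cheap, RM-free pending check possible. A sketch of how they might be used; illustrative only, not the driver's exact implementation:

// Sketch: test the NVLINK error bits in the directly mapped interrupt tree.
// A set bit is only a hint: RM must service interrupts and error_notifier
// must then be inspected before declaring a fatal STO error.
static NV_STATUS nvlink_error_pending_fast(uvm_gpu_t *gpu)
{
    if (!gpu->nvlink_status.enabled)
        return NV_OK;

    if (*gpu->nvlink_status.hw_interrupt_tree_location & gpu->nvlink_status.mask)
        return NV_WARN_MORE_PROCESSING_REQUIRED;

    return NV_OK;
}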
@@ -1001,6 +1041,8 @@ struct uvm_parent_gpu_struct
// Whether CE supports physical addressing mode for writes to vidmem
bool ce_phys_vidmem_write_supported;
// Addressing mode(s) supported for CE transfers between this GPU and its
// peers: none, physical only, physical and virtual, etc.
uvm_gpu_peer_copy_mode_t peer_copy_mode;
// Virtualization mode of the GPU.
@@ -1090,6 +1132,15 @@ struct uvm_parent_gpu_struct
// Indicates whether the GPU can map sysmem with pages larger than 4k
bool can_map_sysmem_with_large_pages;
struct
{
// If true, the granularity of key rotation is a single channel. If
// false, the key replacement affects all channels on the engine. The
// supported granularity is dependent on the number of key slots
// available in HW.
bool per_channel_key_rotation;
} conf_computing;
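The flag translates directly into the scope of a rotation. A sketch with hypothetical helper names:

// Sketch: per-channel granularity re-keys a single channel; otherwise every
// channel on the engine is affected (both helpers are hypothetical).
if (parent_gpu->conf_computing.per_channel_key_rotation)
    status = rotate_key_for_channel(channel);
else
    status = rotate_key_for_engine(engine);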
// VA base and size of the RM managed part of the internal UVM VA space.
//
// The internal UVM VA is shared with RM by RM controlling some of the top
@@ -1102,6 +1153,11 @@ struct uvm_parent_gpu_struct
NvU64 rm_va_base;
NvU64 rm_va_size;
// Base and size of the GPU VA space used for peer identity mappings,
// it is used only if peer_copy_mode is UVM_GPU_PEER_COPY_MODE_VIRTUAL.
NvU64 peer_va_base;
NvU64 peer_va_size;
// Base and size of the GPU VA used for uvm_mem_t allocations mapped in the
// internal address_space_tree.
NvU64 uvm_mem_va_base;
@@ -1260,6 +1316,22 @@ struct uvm_parent_gpu_struct
unsigned long smmu_prod;
unsigned long smmu_cons;
} smmu_war;
struct
{
// Is EGM support enabled on this GPU.
bool enabled;
// Local EGM peer ID. This ID is used to route EGM memory accesses to
// the local CPU socket.
NvU8 local_peer_id;
// EGM base address of the EGM carveout for remote EGM accesses.
// The base address is used when computing PTE PA address values for
// accesses to the local CPU socket's EGM memory from other peer
// GPUs.
NvU64 base_address;
} egm;
};
static const char *uvm_parent_gpu_name(uvm_parent_gpu_t *parent_gpu)
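One way the egm fields above might combine when building a PTE targeting the local socket's EGM carveout; a sketch assuming the UVM_APERTURE_PEER() and uvm_gpu_phys_address() helpers and an offset into the carveout:

// Sketch: accesses are routed by the EGM peer ID and addressed relative to
// base_address. A remote GPU would use its own EGM peer ID for this socket.
uvm_gpu_phys_address_t pa =
    uvm_gpu_phys_address(UVM_APERTURE_PEER(parent_gpu->egm.local_peer_id),
                         parent_gpu->egm.base_address + offset);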
@@ -1330,6 +1402,18 @@ typedef struct
// peer_id[1] from max(gpu_id_1, gpu_id_2) -> min(gpu_id_1, gpu_id_2)
NvU8 peer_ids[2];
// EGM peer Id associated with this device w.r.t. a peer GPU.
// Note: egmPeerId (A -> B) != egmPeerId (B -> A)
// egm_peer_id[0] from min(gpu_id_1, gpu_id_2) -> max(gpu_id_1, gpu_id_2)
// egm_peer_id[1] from max(gpu_id_1, gpu_id_2) -> min(gpu_id_1, gpu_id_2)
//
// Unlike VIDMEM peers, EGM peers are not symmetric. This means that if
// one of the GPUs is EGM-enabled, it does not automatically mean that
// the other is also EGM-enabled. Therefore, an EGM peer ID is only
// valid if the peer GPU is EGM-enabled, i.e. egm_peer_id[0] is valid
// iff max(gpu_id_1, gpu_id_2) is EGM-enabled.
NvU8 egm_peer_ids[2];
// The link type between the peer parent GPUs, currently either PCIe or
// NVLINK.
uvm_gpu_link_type_t link_type;
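Since the IDs are directional, lookups index by the ordering of the two GPU IDs, mirroring peer_ids[]. A minimal sketch (peer_caps stands for an instance of this struct):

// Sketch: EGM peer ID for accesses from gpu_a to gpu_b's socket memory.
// Only valid when gpu_b is EGM-enabled, per the comment above.
size_t idx = uvm_id_value(gpu_a->id) < uvm_id_value(gpu_b->id) ? 0 : 1;
NvU8 egm_peer_id = peer_caps->egm_peer_ids[idx];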
@@ -1372,7 +1456,9 @@ void uvm_gpu_exit_va_space(uvm_va_space_t *va_space);
static unsigned int uvm_gpu_numa_node(uvm_gpu_t *gpu)
{
-UVM_ASSERT(gpu->mem_info.numa.enabled);
+if (!gpu->mem_info.numa.enabled)
+    UVM_ASSERT(gpu->mem_info.numa.node_id == NUMA_NO_NODE);
return gpu->mem_info.numa.node_id;
}
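With node_id a signed int, a non-NUMA GPU can report NUMA_NO_NODE (-1) rather than tripping an assert, and the result can feed the kernel's NUMA-aware allocators directly. An illustrative caller, not from this commit:

// NUMA_NO_NODE is accepted by alloc_pages_node() and falls back to a default
// node, so callers need no special-casing for non-NUMA GPUs.
struct page *page = alloc_pages_node(uvm_gpu_numa_node(gpu), GFP_KERNEL, 0);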
@@ -1381,6 +1467,7 @@ static uvm_gpu_phys_address_t uvm_gpu_page_to_phys_address(uvm_gpu_t *gpu, struc
unsigned long sys_addr = page_to_pfn(page) << PAGE_SHIFT;
unsigned long gpu_offset = sys_addr - gpu->parent->system_bus.memory_window_start;
UVM_ASSERT(gpu->mem_info.numa.enabled);
UVM_ASSERT(page_to_nid(page) == uvm_gpu_numa_node(gpu));
UVM_ASSERT(sys_addr >= gpu->parent->system_bus.memory_window_start);
UVM_ASSERT(sys_addr + PAGE_SIZE - 1 <= gpu->parent->system_bus.memory_window_end);
@@ -1459,6 +1546,18 @@ uvm_gpu_link_type_t uvm_parent_gpu_peer_link_type(uvm_parent_gpu_t *parent_gpu0,
// They must not be the same gpu.
uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu);
// Returns the physical address for use by accessing_gpu of a vidmem allocation
// on the peer owning_gpu. This address can be used for making PTEs on
// accessing_gpu, but not for copying between the two GPUs. For that, use
// uvm_gpu_peer_copy_address.
uvm_gpu_phys_address_t uvm_gpu_peer_phys_address(uvm_gpu_t *owning_gpu, NvU64 address, uvm_gpu_t *accessing_gpu);
// Returns the physical or virtual address for use by accessing_gpu to copy to/
// from a vidmem allocation on the peer owning_gpu. This may be different from
// uvm_gpu_peer_phys_address to handle CE limitations in addressing peer
// physical memory directly.
uvm_gpu_address_t uvm_gpu_peer_copy_address(uvm_gpu_t *owning_gpu, NvU64 address, uvm_gpu_t *accessing_gpu);
// Return the reference count for the P2P state between the given GPUs.
// The two GPUs must have different parents.
NvU64 uvm_gpu_peer_ref_count(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1);
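The pairing matters: PTE construction wants the physical form, while CE copies go through uvm_gpu_peer_copy_address(), which may yield a virtual address depending on the parent GPU's peer_copy_mode. A usage sketch:

// Sketch: map with the physical address, copy with the copy address.
uvm_gpu_phys_address_t pte_addr = uvm_gpu_peer_phys_address(owning_gpu, address, accessing_gpu);
uvm_gpu_address_t copy_addr = uvm_gpu_peer_copy_address(owning_gpu, address, accessing_gpu);
// copy_addr is virtual when peer_copy_mode == UVM_GPU_PEER_COPY_MODE_VIRTUAL,
// working around CE limitations in addressing peer physical memory directly.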
@@ -1467,6 +1566,13 @@ NvU64 uvm_gpu_peer_ref_count(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1);
// address.
uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr);
// Get the EGM aperture for local_gpu to use to map memory resident on the CPU
// NUMA node that remote_gpu is attached to.
// Note that local_gpu can be equal to remote_gpu when memory is resident in
// the CPU NUMA node local to local_gpu. In this case, the local EGM peer ID
// will be used.
uvm_aperture_t uvm_gpu_egm_peer_aperture(uvm_parent_gpu_t *local_gpu, uvm_parent_gpu_t *remote_gpu);
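A short usage sketch of the local and remote cases described above:

// Sketch: same-parent arguments select the local EGM peer ID's aperture.
uvm_aperture_t local_egm = uvm_gpu_egm_peer_aperture(gpu->parent, gpu->parent);
uvm_aperture_t remote_egm = uvm_gpu_egm_peer_aperture(gpu->parent, peer_gpu->parent);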
bool uvm_parent_gpus_are_nvswitch_connected(const uvm_parent_gpu_t *parent_gpu0, const uvm_parent_gpu_t *parent_gpu1);
static bool uvm_gpus_are_smc_peers(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
@@ -1508,8 +1614,8 @@ static uvm_gpu_address_t uvm_parent_gpu_address_virtual_from_sysmem_phys(uvm_par
return uvm_gpu_address_virtual(parent_gpu->flat_sysmem_va_base + pa);
}
-// Given a GPU or CPU physical address (not peer), retrieve an address suitable
-// for CE access.
+// Given a GPU, CPU, or EGM PEER physical address (not VIDMEM peer), retrieve an
+// address suitable for CE access.
static uvm_gpu_address_t uvm_gpu_address_copy(uvm_gpu_t *gpu, uvm_gpu_phys_address_t phys_addr)
{
UVM_ASSERT(phys_addr.aperture == UVM_APERTURE_VID || phys_addr.aperture == UVM_APERTURE_SYS);
@@ -1531,6 +1637,12 @@ static uvm_gpu_identity_mapping_t *uvm_gpu_get_peer_mapping(uvm_gpu_t *gpu, uvm_
return &gpu->peer_mappings[uvm_id_gpu_index(peer_id)];
}
// Check whether the provided address points to peer memory:
// * Physical address using one of the PEER apertures
// * Physical address using SYS aperture that belongs to an exposed coherent memory
// * Virtual address in the region [peer_va_base, peer_va_base + peer_va_size)
bool uvm_gpu_address_is_peer(uvm_gpu_t *gpu, uvm_gpu_address_t address);
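The three cases might fold into a single predicate along these lines; a hedged sketch, assuming uvm_gpu_address_t's is_virtual flag and the PEER_0..PEER_7 aperture range, with a hypothetical coherent-sysmem helper:

// Sketch only: not the driver's exact implementation.
static bool address_is_peer_sketch(uvm_gpu_t *gpu, uvm_gpu_address_t address)
{
    if (address.is_virtual)
        return address.address >= gpu->parent->peer_va_base &&
               address.address < gpu->parent->peer_va_base + gpu->parent->peer_va_size;

    if (address.aperture >= UVM_APERTURE_PEER_0 && address.aperture <= UVM_APERTURE_PEER_7)
        return true;

    // SYS-aperture addresses can still be peer memory when they target
    // another processor's exposed coherent memory (hypothetical helper).
    return address.aperture == UVM_APERTURE_SYS && address_is_exposed_coherent(gpu, address);
}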
// Check for ECC errors
//
// Notably this check cannot be performed where it's not safe to call into RM.
@@ -1543,6 +1655,23 @@ NV_STATUS uvm_gpu_check_ecc_error(uvm_gpu_t *gpu);
// and it's required to call uvm_gpu_check_ecc_error() to be sure.
NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu);
// Check for NVLINK errors
//
// Notably this check cannot be performed where it's not safe to call into RM.
NV_STATUS uvm_gpu_check_nvlink_error(uvm_gpu_t *gpu);

// Check for NVLINK errors without calling into RM
//
// Calling into RM is problematic in many places, this check is always safe to
// do. Returns NV_WARN_MORE_PROCESSING_REQUIRED if there might be an NVLINK
// error and it's required to call uvm_gpu_check_nvlink_error() to be sure.
NV_STATUS uvm_gpu_check_nvlink_error_no_rm(uvm_gpu_t *gpu);

// Inject NVLINK error (testing only)
NV_STATUS uvm_gpu_inject_nvlink_error(uvm_gpu_t *gpu, UVM_TEST_NVLINK_ERROR_TYPE error_type);
NV_STATUS uvm_gpu_get_injected_nvlink_error(uvm_gpu_t *gpu);
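As with the ECC pair above, the intended pattern is to poll cheaply and enter RM only on demand, in a context where calling into RM is safe. A usage sketch:

// Sketch: fast, RM-free check first; full RM-backed check only if needed.
NV_STATUS status = uvm_gpu_check_nvlink_error_no_rm(gpu);
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
    status = uvm_gpu_check_nvlink_error(gpu);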
// Map size bytes of contiguous sysmem on the GPU for physical access
//
// size has to be aligned to PAGE_SIZE.