570.86.15

Bernhard Stoeckner
2025-01-27 19:36:56 +01:00
parent 9d0b0414a5
commit 54d69484da
1166 changed files with 318863 additions and 182687 deletions


@@ -50,7 +50,6 @@
#include "nvlink_caps.h"
#include "nv-hypervisor.h"
#include "nv-ibmnpu.h"
#include "nv-rsync.h"
#include "nv-kthread-q.h"
#include "nv-pat.h"
@@ -127,7 +126,11 @@ MODULE_ALIAS_CHARDEV_MAJOR(NV_MAJOR_DEVICE_NUMBER);
* DMA_BUF namespace is added by commit id 16b0314aa746
* ("dma-buf: move dma-buf symbols into the DMA_BUF module namespace") in 5.16
*/
#if defined(NV_MODULE_IMPORT_NS_TAKES_CONSTANT)
MODULE_IMPORT_NS(DMA_BUF);
#else
MODULE_IMPORT_NS("DMA_BUF");
#endif // defined(NV_MODULE_IMPORT_NS_TAKES_CONSTANT)
#endif // defined(MODULE_IMPORT_NS)
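/*
 * Illustrative sketch, not part of this change: newer kernels (6.13+)
 * require MODULE_IMPORT_NS() to take a string literal ("DMA_BUF"), while
 * older kernels take a bare token (DMA_BUF). A build-time probe behind a
 * macro such as NV_MODULE_IMPORT_NS_TAKES_CONSTANT would typically try to
 * compile the pre-6.13 form and define the macro only if it succeeds:
 *
 *   #include <linux/module.h>
 *   MODULE_IMPORT_NS(DMA_BUF);  // compiles only where the macro still
 *                               // accepts an unquoted namespace token
 */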
const NvBool nv_is_rm_firmware_supported_os = NV_TRUE;
@@ -182,13 +185,11 @@ static void *nvidia_pte_t_cache;
void *nvidia_stack_t_cache;
static nvidia_stack_t *__nv_init_sp;
static int nv_tce_bypass_mode = NV_TCE_BYPASS_MODE_DEFAULT;
struct semaphore nv_linux_devices_lock;
// True if all the successfully probed devices support ATS
// True if at least one of the successfully probed devices support ATS
// Assigned at device probe (module init) time
NvBool nv_ats_supported = NV_TRUE;
NvBool nv_ats_supported;
// allow an easy way to convert all debug printfs related to events
// back and forth between 'info' and 'errors'
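/*
 * Illustrative sketch, not the driver's exact definition: the line that
 * follows this comment in the source typically maps an event-debug level
 * onto one of the standard nv_printf levels so it can be flipped in one
 * place (NV_DBG_EVENTS below is a placeholder name):
 *
 *   #define NV_DBG_EVENTS NV_DBG_INFO   // change to NV_DBG_ERRORS to make
 *                                       // event prints show up as errors
 */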
@@ -589,20 +590,11 @@ nv_registry_keys_init(nv_stack_t *sp)
NvU32 data;
/*
* Determine the TCE bypass mode here so it can be used during
* device probe. Also determine whether we should allow
* user-mode NUMA onlining of device memory.
* Determine whether we should allow user-mode NUMA onlining of device
* memory.
*/
if (NVCPU_IS_PPC64LE)
{
status = rm_read_registry_dword(sp, nv,
NV_REG_TCE_BYPASS_MODE,
&data);
if ((status == NV_OK) && ((int)data != NV_TCE_BYPASS_MODE_DEFAULT))
{
nv_tce_bypass_mode = data;
}
if (NVreg_EnableUserNUMAManagement)
{
/* Force on the core RM registry key to match. */
@@ -1349,15 +1341,6 @@ static int nv_start_device(nv_state_t *nv, nvidia_stack_t *sp)
power_ref = NV_TRUE;
}
rc = nv_init_ibmnpu_devices(nv);
if (rc != 0)
{
nv_printf(NV_DBG_ERRORS,
"NVRM: failed to initialize ibmnpu devices attached to GPU with minor number %d\n",
nvl->minor_num);
goto failed;
}
if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
{
rc = nv_dev_alloc_stacks(nvl);
@@ -1558,8 +1541,6 @@ failed:
nv_dev_free_stacks(nvl);
nv_unregister_ibmnpu_devices(nv);
if (power_ref)
{
rm_unref_dynamic_power(sp, nv, NV_DYNAMIC_PM_COARSE);
@@ -1975,6 +1956,20 @@ void nv_shutdown_adapter(nvidia_stack_t *sp,
rm_shutdown_adapter(sp, nv);
if (nv->flags & NV_FLAG_TRIGGER_FLR)
{
if (nvl->pci_dev)
{
nv_printf(NV_DBG_INFO, "NVRM: Trigger FLR!\n");
os_pci_trigger_flr((void *)nvl->pci_dev);
}
else
{
nv_printf(NV_DBG_ERRORS, "NVRM: FLR not supported by the device!\n");
}
nv->flags &= ~NV_FLAG_TRIGGER_FLR;
}
if (nv_platform_use_auto_online(nvl))
nv_kthread_q_stop(&nvl->remove_numa_memory_q);
}
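/*
 * Illustrative sketch, not part of this change: os_pci_trigger_flr() is
 * implemented elsewhere in the driver. A minimal Function Level Reset
 * helper built on the stock kernel PCI API could look roughly like this;
 * pci_reset_function() tries FLR before falling back to other reset
 * methods:
 *
 *   static int flr_sketch(struct pci_dev *pci_dev)
 *   {
 *       return pci_reset_function(pci_dev);  // 0 on success
 *   }
 */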
@@ -2037,8 +2032,6 @@ static void nv_stop_device(nv_state_t *nv, nvidia_stack_t *sp)
/* leave INIT flag alone so we don't reinit every time */
nv->flags &= ~NV_FLAG_OPEN;
nv_unregister_ibmnpu_devices(nv);
if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
{
rm_unref_dynamic_power(sp, nv, NV_DYNAMIC_PM_COARSE);
@@ -3102,17 +3095,10 @@ nv_set_dma_address_size(
)
{
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
NvU64 start_addr = nv_get_dma_start_address(nv);
NvU64 new_mask = (((NvU64)1) << phys_addr_bits) - 1;
nvl->dma_dev.addressable_range.limit = start_addr + new_mask;
nvl->dma_dev.addressable_range.limit = new_mask;
/*
* The only scenario in which we definitely should not update the DMA mask
* is on POWER, when using TCE bypass mode (see nv_get_dma_start_address()
* for details), since the meaning of the DMA mask is overloaded in that
* case.
*/
if (!nvl->tce_bypass_enabled)
{
dma_set_mask(&nvl->pci_dev->dev, new_mask);
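/*
 * Worked example (illustrative): for phys_addr_bits == 47,
 * new_mask = (1ULL << 47) - 1 = 0x00007FFFFFFFFFFF, so
 * addressable_range.limit covers DMA addresses 0..2^47-1 and the same
 * value is handed to dma_set_mask() above.
 */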
@@ -3741,35 +3727,6 @@ NV_STATUS NV_API_CALL nv_alloc_pages(
if (unencrypted)
at->flags.unencrypted = NV_TRUE;
#if defined(NVCPU_PPC64LE)
/*
* Starting on Power9 systems, DMA addresses for NVLink are no longer the
* same as used over PCIe. There is an address compression scheme required
* for NVLink ONLY which impacts the upper address bits of the DMA address.
*
* This divergence between PCIe and NVLink DMA mappings breaks assumptions
* in the driver where during initialization we allocate system memory
* for the GPU to access over PCIe before NVLink is trained -- and some of
* these mappings persist on the GPU. If these persistent mappings are not
* equivalent they will cause invalid DMA accesses from the GPU once we
* switch to NVLink.
*
* To work around this we limit all system memory allocations from the driver
* during the period before NVLink is enabled to be from NUMA node 0 (CPU 0)
* which has a CPU real address with the upper address bits (above bit 42)
* set to 0. Effectively making the PCIe and NVLink DMA mappings equivalent
* allowing persistent system memory mappings already programmed on the GPU
* to remain valid after NVLink is enabled.
*
* See Bug 1920398 for more details.
*/
if (nv && nvl->npu && !nvl->dma_dev.nvlink)
{
at->flags.node = NV_TRUE;
at->node_id = 0;
}
#endif
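/*
 * Illustrative sketch, not this driver's allocation path: once
 * at->flags.node and at->node_id are set, a node-constrained allocation
 * with the stock kernel allocator would look roughly like:
 *
 *   struct page *page = alloc_pages_node(at->node_id, GFP_KERNEL, 0);
 *   // single page from the requested NUMA node, or NULL on failure
 */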
if (node_id != NUMA_NO_NODE)
{
at->flags.node = NV_TRUE;
@@ -4873,154 +4830,6 @@ NV_STATUS NV_API_CALL nv_log_error(
return status;
}
NvU64 NV_API_CALL nv_get_dma_start_address(
nv_state_t *nv
)
{
#if defined(NVCPU_PPC64LE)
struct pci_dev *pci_dev;
dma_addr_t dma_addr;
NvU64 saved_dma_mask;
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
/*
* If TCE bypass is disabled via a module parameter, then just return
* the default (which is 0).
*
* Otherwise, the DMA start address only needs to be set once, and it
* won't change afterward. Just return the cached value if asked again,
* to avoid the kernel printing redundant messages to the kernel
* log when we call pci_set_dma_mask().
*/
if ((nv_tce_bypass_mode == NV_TCE_BYPASS_MODE_DISABLE) ||
(nvl->tce_bypass_enabled))
{
return nvl->dma_dev.addressable_range.start;
}
pci_dev = nvl->pci_dev;
/*
* Linux on IBM POWER8 offers 2 different DMA set-ups, sometimes
* referred to as "windows".
*
* The "default window" provides a 2GB region of PCI address space
* located below the 32-bit line. The IOMMU is used to provide a
* "rich" mapping--any page in system memory can be mapped at an
* arbitrary address within this window. The mappings are dynamic
* and pass in and out of being as pci_map*()/pci_unmap*() calls
* are made.
*
* Dynamic DMA Windows (sometimes "Huge DDW") provides a linear
* mapping of the system's entire physical address space at some
* fixed offset above the 59-bit line. IOMMU is still used, and
* pci_map*()/pci_unmap*() are still required, but mappings are
* static. They're effectively set up in advance, and any given
* system page will always map to the same PCI bus address. I.e.
* physical 0x00000000xxxxxxxx => PCI 0x08000000xxxxxxxx
*
* This driver does not support the 2G default window because
* of its limited size, and for reasons having to do with UVM.
*
* Linux on POWER8 will only provide the DDW-style full linear
* mapping when the driver claims support for 64-bit DMA addressing
* (a pre-requisite because the PCI addresses used in this case will
* be near the top of the 64-bit range). The linear mapping
* is not available in all system configurations.
*
* Detect whether the linear mapping is present by claiming
* 64-bit support and then mapping physical page 0. For historical
* reasons, Linux on POWER8 will never map a page to PCI address 0x0.
* In the "default window" case page 0 will be mapped to some
* non-zero address below the 32-bit line. In the
* DDW/linear-mapping case, it will be mapped to address 0 plus
* some high-order offset.
*
* If the linear mapping is present and sane then return the offset
* as the starting address for all DMA mappings.
*/
saved_dma_mask = pci_dev->dma_mask;
if (dma_set_mask(&pci_dev->dev, DMA_BIT_MASK(64)) != 0)
{
goto done;
}
dma_addr = dma_map_single(&pci_dev->dev, NULL, 1, DMA_BIDIRECTIONAL);
if (dma_mapping_error(&pci_dev->dev, dma_addr))
{
dma_set_mask(&pci_dev->dev, saved_dma_mask);
goto done;
}
dma_unmap_single(&pci_dev->dev, dma_addr, 1, DMA_BIDIRECTIONAL);
/*
* From IBM: "For IODA2, native DMA bypass or KVM TCE-based implementation
* of full 64-bit DMA support will establish a window in address-space
* with the high 14 bits being constant and the bottom up-to-50 bits
* varying with the mapping."
*
* Unfortunately, we don't have any good interfaces or definitions from
* the kernel to get information about the DMA offset assigned by OS.
* However, we have been told that the offset will be defined by the top
* 14 bits of the address, and bits 40-49 will not vary for any DMA
* mappings until 1TB of system memory is surpassed; this limitation is
* essential for us to function properly since our current GPUs only
* support 40 physical address bits. We are in a fragile place where we
* need to tell the OS that we're capable of 64-bit addressing, while
* relying on the assumption that the top 24 bits will not vary in this
* case.
*
* The way we try to compute the window, then, is mask the trial mapping
* against the DMA capabilities of the device. That way, devices with
* greater addressing capabilities will only take the bits it needs to
* define the window.
*/
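/*
 * Worked example (illustrative): with saved_dma_mask == DMA_BIT_MASK(40)
 * and a trial mapping of dma_addr == 0x0800000000001000, the window start
 * computed below is dma_addr & ~saved_dma_mask == 0x0800000000000000,
 * matching the "physical 0x00000000xxxxxxxx => PCI 0x08000000xxxxxxxx"
 * layout described above.
 */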
if ((dma_addr & DMA_BIT_MASK(32)) != 0)
{
/*
* Huge DDW not available - page 0 mapped to non-zero address below
* the 32-bit line.
*/
nv_printf(NV_DBG_WARNINGS,
"NVRM: DMA window limited by platform\n");
dma_set_mask(&pci_dev->dev, saved_dma_mask);
goto done;
}
else if ((dma_addr & saved_dma_mask) != 0)
{
NvU64 memory_size = NV_NUM_PHYSPAGES * PAGE_SIZE;
if ((dma_addr & ~saved_dma_mask) !=
((dma_addr + memory_size) & ~saved_dma_mask))
{
/*
* The physical window straddles our addressing limit boundary,
* e.g., for an adapter that can address up to 1TB, the window
* crosses the 40-bit limit so that the lower end of the range
* has different bits 63:40 than the higher end of the range.
* We can only handle a single, static value for bits 63:40, so
* we must fall back here.
*/
nv_printf(NV_DBG_WARNINGS,
"NVRM: DMA window limited by memory size\n");
dma_set_mask(&pci_dev->dev, saved_dma_mask);
goto done;
}
}
nvl->tce_bypass_enabled = NV_TRUE;
nvl->dma_dev.addressable_range.start = dma_addr & ~(saved_dma_mask);
/* Update the coherent mask to match */
dma_set_coherent_mask(&pci_dev->dev, pci_dev->dma_mask);
done:
return nvl->dma_dev.addressable_range.start;
#else
return 0;
#endif
}
NV_STATUS NV_API_CALL nv_set_primary_vga_status(
nv_state_t *nv
)
@@ -5041,38 +4850,6 @@ NV_STATUS NV_API_CALL nv_set_primary_vga_status(
#endif
}
NV_STATUS NV_API_CALL nv_pci_trigger_recovery(
nv_state_t *nv
)
{
NV_STATUS status = NV_ERR_NOT_SUPPORTED;
#if defined(NV_PCI_ERROR_RECOVERY)
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
/*
* Calling readl() on PPC64LE will allow the kernel to check its state for
* the device and update it accordingly. This needs to be done before
* checking if the PCI channel is offline, so that we don't check stale
* state.
*
* This will also kick off the recovery process for the device.
*/
if (NV_PCI_ERROR_RECOVERY_ENABLED())
{
if (readl(nv->regs->map) == 0xFFFFFFFF)
{
if (pci_channel_offline(nvl->pci_dev))
{
NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
"PCI channel for the device is offline\n");
status = NV_OK;
}
}
}
#endif
return status;
}
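/*
 * For reference (stock kernel API, not part of this change):
 * pci_channel_offline() is a small inline in <linux/pci.h> that reports
 * whether the device's PCI channel has left the normal I/O state, roughly:
 *
 *   static inline int pci_channel_offline(struct pci_dev *pdev)
 *   {
 *       return (pdev->error_state != pci_channel_io_normal);
 *   }
 *
 * which is why the 0xFFFFFFFF readl() result above is cross-checked against
 * it before declaring the device offline.
 */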
NvBool NV_API_CALL nv_requires_dma_remap(
nv_state_t *nv
)
@@ -5304,44 +5081,6 @@ NV_STATUS NV_API_CALL nv_get_device_memory_config(
{
NV_STATUS status = NV_ERR_NOT_SUPPORTED;
#if defined(NVCPU_PPC64LE)
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
if (!nv_platform_supports_numa(nvl))
{
return NV_ERR_NOT_SUPPORTED;
}
if (node_id != NULL)
{
*node_id = nvl->numa_info.node_id;
}
{
nv_npu_numa_info_t *numa_info;
numa_info = &nvl->npu->numa_info;
if (compr_addr_sys_phys != NULL)
{
*compr_addr_sys_phys =
numa_info->compr_sys_phys_addr;
}
if (addr_guest_phys != NULL)
{
*addr_guest_phys =
numa_info->guest_phys_addr;
}
}
if (addr_width != NULL)
{
*addr_width = nv_volta_dma_addr_size - nv_volta_addr_space_width;
}
status = NV_OK;
#endif
#if defined(NVCPU_AARCH64)
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
@@ -5374,68 +5113,6 @@ NV_STATUS NV_API_CALL nv_get_device_memory_config(
return status;
}
#if defined(NVCPU_PPC64LE)
NV_STATUS NV_API_CALL nv_get_nvlink_line_rate(
nv_state_t *nvState,
NvU32 *linerate
)
{
#if defined(NV_PNV_PCI_GET_NPU_DEV_PRESENT)
nv_linux_state_t *nvl;
struct pci_dev *npuDev;
NvU32 *pSpeedPtr = NULL;
NvU32 speed;
int len;
if (nvState != NULL)
nvl = NV_GET_NVL_FROM_NV_STATE(nvState);
else
return NV_ERR_INVALID_ARGUMENT;
if (!nvl->npu)
{
return NV_ERR_NOT_SUPPORTED;
}
npuDev = nvl->npu->devs[0];
if (!npuDev->dev.of_node)
{
nv_printf(NV_DBG_ERRORS, "NVRM: %s: OF Node not found in IBM-NPU device node\n",
__FUNCTION__);
return NV_ERR_NOT_SUPPORTED;
}
pSpeedPtr = (NvU32 *) of_get_property(npuDev->dev.of_node, "ibm,nvlink-speed", &len);
if (pSpeedPtr)
{
speed = (NvU32) be32_to_cpup(pSpeedPtr);
}
else
{
return NV_ERR_NOT_SUPPORTED;
}
if (!speed)
{
return NV_ERR_NOT_SUPPORTED;
}
else
{
*linerate = speed;
}
return NV_OK;
#endif
return NV_ERR_NOT_SUPPORTED;
}
#endif
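/*
 * Illustrative sketch, not part of this change: the removed function above
 * read the "ibm,nvlink-speed" device-tree property with of_get_property()
 * plus be32_to_cpup(); the same read with the higher-level OF helper would
 * be roughly:
 *
 *   u32 speed;
 *   if (of_property_read_u32(npuDev->dev.of_node, "ibm,nvlink-speed",
 *                            &speed) != 0)
 *       return NV_ERR_NOT_SUPPORTED;   // property missing or malformed
 */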
NV_STATUS NV_API_CALL nv_indicate_idle(
nv_state_t *nv
)
@@ -5972,9 +5649,7 @@ void NV_API_CALL nv_disallow_runtime_suspend
void NV_API_CALL nv_flush_coherent_cpu_cache_range(nv_state_t *nv, NvU64 cpu_virtual, NvU64 size)
{
#if NVCPU_IS_PPC64LE
return nv_ibmnpu_cache_flush_range(nv, cpu_virtual, size);
#elif NVCPU_IS_AARCH64
#if NVCPU_IS_AARCH64
NvU64 va, cbsize;
NvU64 end_cpu_virtual = cpu_virtual + size;
@@ -5983,8 +5658,6 @@ void NV_API_CALL nv_flush_coherent_cpu_cache_range(nv_state_t *nv, NvU64 cpu_vir
cpu_virtual, end_cpu_virtual);
cbsize = cache_line_size();
// Align address to line size
cpu_virtual = NV_ALIGN_UP(cpu_virtual, cbsize);
// Force eviction of any cache lines from the NUMA-onlined region.
for (va = cpu_virtual; va < end_cpu_virtual; va += cbsize)
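/*
 * The loop body falls outside this hunk; on AArch64 a flush-by-VA loop like
 * this one typically issues a clean-and-invalidate to the point of
 * coherency for each cache line, roughly (illustrative, not the driver's
 * exact code):
 *
 *   asm volatile("dc civac, %0" : : "r" (va) : "memory");
 */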
@@ -6147,7 +5820,7 @@ void NV_API_CALL nv_get_screen_info(
*pFbHeight = registered_fb[i]->var.yres;
*pFbDepth = registered_fb[i]->var.bits_per_pixel;
*pFbPitch = registered_fb[i]->fix.line_length;
*pFbSize = (NvU64)(*pFbHeight) * (NvU64)(*pFbPitch);
*pFbSize = registered_fb[i]->fix.smem_len;
return;
}
}
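/*
 * Note on the change above (illustrative arithmetic): fix.smem_len is the
 * length of the framebuffer memory region reported by the fbdev driver,
 * which can exceed the visible area. For a 1920x1080, 32 bpp console with
 * line_length == 7680, yres * line_length is 8294400 bytes, while smem_len
 * may also cover extra scanout/panning memory mapped by the driver.
 */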