From 443ace971f637844ec8b9f8cb68cd34f82d3435a Mon Sep 17 00:00:00 2001 From: Maneet Singh Date: Tue, 17 Jun 2025 11:52:54 -0700 Subject: [PATCH] 570.158.01 --- README.md | 15 +++- kernel-open/Kbuild | 2 +- kernel-open/conftest.sh | 12 +-- kernel-open/nvidia-uvm/uvm_ats_faults.c | 16 +++- kernel-open/nvidia/nv-caps.c | 29 ++++++- kernel-open/nvidia/nvidia.Kbuild | 2 +- kernel-open/nvidia/os-interface.c | 21 ++--- src/common/inc/nvBldVer.h | 20 ++--- src/common/inc/nvUnixVersion.h | 2 +- .../nvidia/inc/ctrl/ctrl2080/ctrl2080gpu.h | 9 +- .../inc/ctrl/ctrl2080/ctrl2080internal.h | 2 +- src/common/sdk/nvidia/inc/nverror.h | 13 ++- src/common/shared/inc/nvdevid.h | 5 ++ src/nvidia/generated/g_gsync_nvoc.h | 6 +- src/nvidia/generated/g_nv_name_released.h | 9 ++ .../kernel/gpu/external_device/dac_p2060.h | 4 +- .../kernel/gpu/external_device/dac_p2061.h | 4 +- .../phys_mem_allocator_util.h | 6 +- src/nvidia/inc/kernel/rmapi/rmapi_utils.h | 2 +- src/nvidia/interface/nvrm_registry.h | 10 +++ .../arch/kepler/kern_gsync_p2060.c | 86 +++++++++++++++++-- .../arch/pascal/kern_gsync_p2061.c | 15 ++-- .../src/kernel/gpu/external_device/gsync.c | 23 ++--- .../gsp/arch/blackwell/kernel_gsp_ecc_gb100.c | 1 + .../phys_mem_allocator/phys_mem_allocator.c | 4 +- .../phys_mem_allocator_util.c | 43 ++++------ .../gpu/mig_mgr/arch/ampere/kmigmgr_ga100.c | 5 ++ .../mig_mgr/arch/blackwell/kmigmgr_gb100.c | 5 ++ .../mig_mgr/arch/blackwell/kmigmgr_gb10b.c | 5 ++ .../mig_mgr/arch/blackwell/kmigmgr_gb202.c | 15 ++++ .../gpu/mig_mgr/arch/hopper/kmigmgr_gh100.c | 6 ++ .../gpu/mig_mgr/gpu_instance_subscription.c | 2 +- src/nvidia/src/kernel/rmapi/control.c | 2 +- src/nvidia/src/kernel/rmapi/rmapi_utils.c | 7 +- src/nvidia/src/kernel/vgpu/vgpu_util.c | 4 +- version.mk | 2 +- 36 files changed, 302 insertions(+), 112 deletions(-) diff --git a/README.md b/README.md index d01e78410..e89fa1f1a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # NVIDIA Linux Open GPU Kernel Module Source This is the 
source release of the NVIDIA Linux open GPU kernel modules, -version 570.153.02. +version 570.158.01. ## How to Build @@ -17,7 +17,7 @@ as root: Note that the kernel modules built here must be used with GSP firmware and user-space NVIDIA GPU driver components from a corresponding -570.153.02 driver release. This can be achieved by installing +570.158.01 driver release. This can be achieved by installing the NVIDIA GPU driver from the .run file using the `--no-kernel-modules` option. E.g., @@ -185,7 +185,7 @@ table below). For details on feature support and limitations, see the NVIDIA GPU driver end user README here: -https://us.download.nvidia.com/XFree86/Linux-x86_64/570.153.02/README/kernel_open.html +https://us.download.nvidia.com/XFree86/Linux-x86_64/570.158.01/README/kernel_open.html For vGPU support, please refer to the README.vgpu packaged in the vGPU Host Package for more details. @@ -966,11 +966,20 @@ Subsystem Device ID. | NVIDIA GeForce RTX 5070 Ti | 2C05 | | NVIDIA GeForce RTX 5090 Laptop GPU | 2C18 | | NVIDIA GeForce RTX 5080 Laptop GPU | 2C19 | +| NVIDIA RTX PRO 5000 Blackwell Generation Laptop GPU | 2C38 | +| NVIDIA RTX PRO 4000 Blackwell Generation Laptop GPU | 2C39 | | NVIDIA GeForce RTX 5090 Laptop GPU | 2C58 | | NVIDIA GeForce RTX 5080 Laptop GPU | 2C59 | | NVIDIA GeForce RTX 5060 Ti | 2D04 | +| NVIDIA GeForce RTX 5060 | 2D05 | | NVIDIA GeForce RTX 5070 Laptop GPU | 2D18 | +| NVIDIA GeForce RTX 5060 Laptop GPU | 2D19 | +| NVIDIA RTX PRO 2000 Blackwell Generation Laptop GPU | 2D39 | | NVIDIA GeForce RTX 5070 Laptop GPU | 2D58 | +| NVIDIA GeForce RTX 5060 Laptop GPU | 2D59 | +| NVIDIA RTX PRO 1000 Blackwell Generation Laptop GPU | 2DB8 | +| NVIDIA RTX PRO 500 Blackwell Generation Laptop GPU | 2DB9 | | NVIDIA GeForce RTX 5070 | 2F04 | | NVIDIA GeForce RTX 5070 Ti Laptop GPU | 2F18 | +| NVIDIA RTX PRO 3000 Blackwell Generation Laptop GPU | 2F38 | | NVIDIA GeForce RTX 5070 Ti Laptop GPU | 2F58 | diff --git a/kernel-open/Kbuild b/kernel-open/Kbuild 
index 7961d16c9..0ebc9e4f9 100644 --- a/kernel-open/Kbuild +++ b/kernel-open/Kbuild @@ -79,7 +79,7 @@ ccflags-y += -I$(src)/common/inc ccflags-y += -I$(src) ccflags-y += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-format-extra-args ccflags-y += -D__KERNEL__ -DMODULE -DNVRM -ccflags-y += -DNV_VERSION_STRING=\"570.153.02\" +ccflags-y += -DNV_VERSION_STRING=\"570.158.01\" ifneq ($(SYSSRCHOST1X),) ccflags-y += -I$(SYSSRCHOST1X) diff --git a/kernel-open/conftest.sh b/kernel-open/conftest.sh index 50070eb98..7193ffda0 100755 --- a/kernel-open/conftest.sh +++ b/kernel-open/conftest.sh @@ -6602,22 +6602,22 @@ compile_test() { compile_check_conftest "$CODE" "NV_DRM_DRIVER_HAS_DUMB_DESTROY" "" "types" ;; - memory_failure_has_trapno_arg) + memory_failure_queue_has_trapno_arg) # - # Check if memory_failure() has trapno parameter. + # Check if memory_failure_queue() has trapno parameter. # # Removed by commit 83b57531c58f ("mm/memory_failure: Remove # unused trapno from memory_failure") in v4.16. 
# CODE=" #include - void conftest_memory_failure_has_trapno_arg(unsigned long pfn, + void conftest_memory_failure_queue_has_trapno_arg(unsigned long pfn, int trapno, int flags) { - (void) memory_failure(pfn, trapno, flags); + memory_failure_queue(pfn, trapno, flags); }" - compile_check_conftest "$CODE" "NV_MEMORY_FAILURE_HAS_TRAPNO_ARG" "" "types" + compile_check_conftest "$CODE" "NV_MEMORY_FAILURE_QUEUE_HAS_TRAPNO_ARG" "" "types" ;; memory_failure_mf_sw_simulated_defined) @@ -7571,7 +7571,7 @@ compile_test() { CODE=" #include int conftest_page_pgmap(void) { - return page_pgmap(NULL); + return page_pgmap(); }" compile_check_conftest "$CODE" "NV_PAGE_PGMAP_PRESENT" "" "functions" diff --git a/kernel-open/nvidia-uvm/uvm_ats_faults.c b/kernel-open/nvidia-uvm/uvm_ats_faults.c index 6f650760b..4c8acb4cb 100644 --- a/kernel-open/nvidia-uvm/uvm_ats_faults.c +++ b/kernel-open/nvidia-uvm/uvm_ats_faults.c @@ -767,6 +767,20 @@ NV_STATUS uvm_ats_service_access_counters(uvm_gpu_va_space_t *gpu_va_space, &ats_context->access_counters.accessed_mask, &ats_context->prefetch_state.residency_mask); + // Pretend that pages that are already resident at the destination GPU were + // migrated now. This makes sure that the access counter is cleared even if + // the accessed pages were already resident on the target. + // TODO: Bug 5296998: [uvm][ats] Not clearing stale access counter + // notifications can lead to missed migrations + // The same problem of stale notification exists for migration to other + // locations than local vidmem. However, stale notifications to data + // migrated to another remote location are identical to those triggered + // by accessing memory that cannot or should not be migrated. 
+ if (uvm_id_equal(ats_context->residency_id, gpu_va_space->gpu->id)) { + uvm_page_mask_copy(&ats_context->access_counters.migrated_mask, + &ats_context->prefetch_state.residency_mask); + } + for_each_va_block_subregion_in_mask(subregion, &ats_context->access_counters.accessed_mask, region) { NV_STATUS status; NvU64 start = base + (subregion.first * PAGE_SIZE); @@ -779,7 +793,7 @@ NV_STATUS uvm_ats_service_access_counters(uvm_gpu_va_space_t *gpu_va_space, status = service_ats_requests(gpu_va_space, vma, start, length, access_type, service_type, ats_context); - // clear access counters if pages were migrated or migration needs to + // Clear access counters if pages were migrated or migration needs to // be retried if (status == NV_OK || status == NV_ERR_BUSY_RETRY) uvm_page_mask_region_fill(migrated_mask, subregion); diff --git a/kernel-open/nvidia/nv-caps.c b/kernel-open/nvidia/nv-caps.c index 391f31aba..6becad959 100644 --- a/kernel-open/nvidia/nv-caps.c +++ b/kernel-open/nvidia/nv-caps.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -582,6 +582,9 @@ err: void NV_API_CALL nv_cap_close_fd(int fd) { #if NV_FILESYSTEM_ACCESS_AVAILABLE + struct file *file; + NvBool is_nv_cap_fd; + if (fd == -1) { return; @@ -600,6 +603,30 @@ void NV_API_CALL nv_cap_close_fd(int fd) return; } + file = fget(fd); + if (file == NULL) + { + task_unlock(current); + return; + } + + /* Make sure the fd belongs to the nv-cap-drv */ + is_nv_cap_fd = (file->f_op == &g_nv_cap_drv_fops); + + fput(file); + + /* + * In some cases, we may be in shutdown path and execute + * in context of unrelated process. 
In that case we should + * not access any 'current' state, but instead let kernel + * clean up capability files on its own. + */ + if (!is_nv_cap_fd) + { + task_unlock(current); + return; + } + /* * From v4.17-rc1 (to v5.10.8) kernels have stopped exporting sys_close(fd) * and started exporting __close_fd, as of this commit: diff --git a/kernel-open/nvidia/nvidia.Kbuild b/kernel-open/nvidia/nvidia.Kbuild index eef07bcbb..db4f61a46 100644 --- a/kernel-open/nvidia/nvidia.Kbuild +++ b/kernel-open/nvidia/nvidia.Kbuild @@ -257,7 +257,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += add_memory_driver_managed_has_mhp_flags_arg NV_CONFTEST_TYPE_COMPILE_TESTS += num_registered_fb NV_CONFTEST_TYPE_COMPILE_TESTS += pci_driver_has_driver_managed_dma NV_CONFTEST_TYPE_COMPILE_TESTS += vm_area_struct_has_const_vm_flags -NV_CONFTEST_TYPE_COMPILE_TESTS += memory_failure_has_trapno_arg +NV_CONFTEST_TYPE_COMPILE_TESTS += memory_failure_queue_has_trapno_arg NV_CONFTEST_TYPE_COMPILE_TESTS += foll_longterm_present NV_CONFTEST_TYPE_COMPILE_TESTS += bus_type_has_iommu_ops NV_CONFTEST_TYPE_COMPILE_TESTS += class_create_has_no_owner_arg diff --git a/kernel-open/nvidia/os-interface.c b/kernel-open/nvidia/os-interface.c index 20e6e6826..24669d310 100644 --- a/kernel-open/nvidia/os-interface.c +++ b/kernel-open/nvidia/os-interface.c @@ -2596,7 +2596,6 @@ NV_STATUS NV_API_CALL os_offline_page_at_address { #if defined(CONFIG_MEMORY_FAILURE) int flags = 0; - int ret; NvU64 pfn; struct page *page = NV_GET_PAGE_STRUCT(address); @@ -2621,22 +2620,18 @@ NV_STATUS NV_API_CALL os_offline_page_at_address flags |= MF_SW_SIMULATED; #endif -#ifdef NV_MEMORY_FAILURE_HAS_TRAPNO_ARG - ret = memory_failure(pfn, 0, flags); -#else - ret = memory_failure(pfn, flags); -#endif + nv_printf(NV_DBG_INFO, "NVRM: offlining page at address: 0x%llx pfn: 0x%llx\n", + address, pfn); - if (ret != 0) - { - nv_printf(NV_DBG_ERRORS, "NVRM: page offlining failed. 
address: 0x%llx pfn: 0x%llx ret: %d\n", - address, pfn, ret); - return NV_ERR_OPERATING_SYSTEM; - } +#ifdef NV_MEMORY_FAILURE_QUEUE_HAS_TRAPNO_ARG + memory_failure_queue(pfn, 0, flags); +#else + memory_failure_queue(pfn, flags); +#endif return NV_OK; #else // !defined(CONFIG_MEMORY_FAILURE) - nv_printf(NV_DBG_ERRORS, "NVRM: memory_failure() not supported by kernel. page offlining failed. address: 0x%llx\n", + nv_printf(NV_DBG_ERRORS, "NVRM: memory_failure_queue() not supported by kernel. page offlining failed. address: 0x%llx\n", address); return NV_ERR_NOT_SUPPORTED; #endif diff --git a/src/common/inc/nvBldVer.h b/src/common/inc/nvBldVer.h index 0722876e2..72d0c4105 100644 --- a/src/common/inc/nvBldVer.h +++ b/src/common/inc/nvBldVer.h @@ -36,25 +36,25 @@ // and then checked back in. You cannot make changes to these sections without // corresponding changes to the buildmeister script #ifndef NV_BUILD_BRANCH - #define NV_BUILD_BRANCH r573_24 + #define NV_BUILD_BRANCH r573_30 #endif #ifndef NV_PUBLIC_BRANCH - #define NV_PUBLIC_BRANCH r573_24 + #define NV_PUBLIC_BRANCH r573_30 #endif #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) -#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r570/r573_24-444" -#define NV_BUILD_CHANGELIST_NUM (35974374) +#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r570/r573_30-464" +#define NV_BUILD_CHANGELIST_NUM (36065453) #define NV_BUILD_TYPE "Official" -#define NV_BUILD_NAME "rel/gpu_drv/r570/r573_24-444" -#define NV_LAST_OFFICIAL_CHANGELIST_NUM (35974374) +#define NV_BUILD_NAME "rel/gpu_drv/r570/r573_30-464" +#define NV_LAST_OFFICIAL_CHANGELIST_NUM (36065453) #else /* Windows builds */ -#define NV_BUILD_BRANCH_VERSION "r573_24-1" -#define NV_BUILD_CHANGELIST_NUM (35972701) +#define NV_BUILD_BRANCH_VERSION "r573_30-1" +#define NV_BUILD_CHANGELIST_NUM (36065453) #define NV_BUILD_TYPE "Official" -#define NV_BUILD_NAME "573.25" -#define NV_LAST_OFFICIAL_CHANGELIST_NUM (35972701) +#define NV_BUILD_NAME "573.32" +#define 
NV_LAST_OFFICIAL_CHANGELIST_NUM (36065453) #define NV_BUILD_BRANCH_BASE_VERSION R570 #endif // End buildmeister python edited section diff --git a/src/common/inc/nvUnixVersion.h b/src/common/inc/nvUnixVersion.h index 1ab83b8dc..5bd42c3ff 100644 --- a/src/common/inc/nvUnixVersion.h +++ b/src/common/inc/nvUnixVersion.h @@ -4,7 +4,7 @@ #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) || defined(NV_VMWARE) || defined(NV_QNX) || defined(NV_INTEGRITY) || \ (defined(RMCFG_FEATURE_PLATFORM_GSP) && RMCFG_FEATURE_PLATFORM_GSP == 1) -#define NV_VERSION_STRING "570.153.02" +#define NV_VERSION_STRING "570.158.01" #else diff --git a/src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080gpu.h b/src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080gpu.h index d702191bc..9649232e8 100644 --- a/src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080gpu.h +++ b/src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080gpu.h @@ -2634,7 +2634,14 @@ typedef struct NV2080_CTRL_GPU_SET_PARTITION_INFO { #define NV2080_CTRL_GPU_PARTITION_FLAG_GFX_SIZE__SIZE 8U -#define NV2080_CTRL_GPU_PARTITION_MAX_TYPES 40U +#define NV2080_CTRL_GPU_PARTITION_MAX_TYPES 90U + +#define NV2080_CTRL_GPU_PARTITION_FLAG_REQ_ALL_MEDIA 29:28 +#define NV2080_CTRL_GPU_PARTITION_FLAG_REQ_ALL_MEDIA_DEFAULT 0U +#define NV2080_CTRL_GPU_PARTITION_FLAG_REQ_ALL_MEDIA_DISABLE 1U +#define NV2080_CTRL_GPU_PARTITION_FLAG_REQ_ALL_MEDIA_ENABLE 2U + + #define NV2080_CTRL_GPU_PARTITION_FLAG_REQ_DEC_JPG_OFA 30:30 #define NV2080_CTRL_GPU_PARTITION_FLAG_REQ_DEC_JPG_OFA_DISABLE 0U #define NV2080_CTRL_GPU_PARTITION_FLAG_REQ_DEC_JPG_OFA_ENABLE 1U diff --git a/src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080internal.h b/src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080internal.h index 27e641c82..cde9644e9 100644 --- a/src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080internal.h +++ b/src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080internal.h @@ -1003,7 +1003,7 @@ typedef struct NV2080_CTRL_INTERNAL_DISPLAY_SETUP_RG_LINE_INTR_PARAMS { * validGfxCTSIdMask 
[OUT] * # mask of CTS IDs that contain Gfx capable Grs which can be assigned under this profile */ -#define NV2080_CTRL_INTERNAL_GRMGR_PARTITION_MAX_TYPES 60 +#define NV2080_CTRL_INTERNAL_GRMGR_PARTITION_MAX_TYPES 90 diff --git a/src/common/sdk/nvidia/inc/nverror.h b/src/common/sdk/nvidia/inc/nverror.h index 1d6e83ba3..79167d110 100644 --- a/src/common/sdk/nvidia/inc/nverror.h +++ b/src/common/sdk/nvidia/inc/nverror.h @@ -153,7 +153,18 @@ #define RESOURCE_RETIREMENT_FAILURE (157) #define CHANNEL_RETIREMENT_EVENT (160) #define CHANNEL_RETIREMENT_FAILURE (161) -#define ROBUST_CHANNEL_LAST_ERROR (161) +#define ROBUST_CHANNEL_UNUSED_ERROR_162 (162) +#define ROBUST_CHANNEL_UNUSED_ERROR_163 (163) +#define ROBUST_CHANNEL_UNUSED_ERROR_164 (164) +#define ROBUST_CHANNEL_UNUSED_ERROR_165 (165) +#define ROBUST_CHANNEL_UNUSED_ERROR_166 (166) +#define ROBUST_CHANNEL_UNUSED_ERROR_167 (167) +#define ROBUST_CHANNEL_UNUSED_ERROR_168 (168) +#define ROBUST_CHANNEL_UNUSED_ERROR_169 (169) +#define ROBUST_CHANNEL_UNUSED_ERROR_170 (170) +#define UNCORRECTABLE_DRAM_ERROR (171) +#define UNCORRECTABLE_SRAM_ERROR (172) +#define ROBUST_CHANNEL_LAST_ERROR (172) // Indexed CE reference #define ROBUST_CHANNEL_CE_ERROR(x) \ diff --git a/src/common/shared/inc/nvdevid.h b/src/common/shared/inc/nvdevid.h index d04a439e4..9ebe0d946 100644 --- a/src/common/shared/inc/nvdevid.h +++ b/src/common/shared/inc/nvdevid.h @@ -172,6 +172,11 @@ // A16 #define NV_PCI_SUBID_DEVICE_PG171_SKU200 0x14A9 +// NVIDIA B200 +#define NV_PCI_SUBID_DEVICE_PG525_SKU220 0x1999 +#define NV_PCI_SUBID_DEVICE_PG525_SKU225 0x199B +#define NV_PCI_SUBID_DEVICE_PG525_SKU230 0x20DA + /////////////////////////////////////////////////////////////////////////////////////////// // // CHIPSET IDs diff --git a/src/nvidia/generated/g_gsync_nvoc.h b/src/nvidia/generated/g_gsync_nvoc.h index a2558cdde..1a702127f 100644 --- a/src/nvidia/generated/g_gsync_nvoc.h +++ b/src/nvidia/generated/g_gsync_nvoc.h @@ -14,7 +14,7 @@ extern "C" { #endif /* - 
* SPDX-FileCopyrightText: Copyright (c) 2006-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2006-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -140,7 +140,7 @@ typedef NV_STATUS GsyncSetInterlaceMode (struct OBJGPU *, PDACEXTERNALDEVICE, N typedef NV_STATUS GsyncRefSwapBarrier (struct OBJGPU *, PDACEXTERNALDEVICE, REFTYPE, NvBool *); typedef NV_STATUS GsyncRefSignal (struct OBJGPU *, PDACEXTERNALDEVICE, REFTYPE, GSYNCSYNCSIGNAL, NvBool TestRate, NvU32 *); typedef NV_STATUS GsyncRefMaster (struct OBJGPU *, OBJGSYNC *, REFTYPE, NvU32 *, NvU32 *, NvBool, NvBool); -typedef NV_STATUS GsyncRefSlaves (struct OBJGPU *, PDACEXTERNALDEVICE, REFTYPE, NvU32 *, NvU32 *); +typedef NV_STATUS GsyncRefSlaves (struct OBJGPU *, OBJGSYNC *, REFTYPE, NvU32 *, NvU32 *); typedef NV_STATUS GsyncGetCplStatus (struct OBJGPU *, PDACEXTERNALDEVICE, GSYNCSTATUS, NvU32 *); typedef NV_STATUS GsyncSetWatchdog (struct OBJGPU *, PDACEXTERNALDEVICE, NvU32); typedef NV_STATUS GsyncGetRevision (struct OBJGPU *, OBJGSYNC *, GSYNCCAPSPARAMS *); @@ -154,7 +154,7 @@ typedef NV_STATUS GsyncGetHouseSyncMode (struct OBJGPU *, PDACEXTERNALDEVICE, N typedef NV_STATUS GsyncSetHouseSyncMode (struct OBJGPU *, PDACEXTERNALDEVICE, NvU8); typedef NV_STATUS GsyncGetMulDiv (struct OBJGPU *, DACEXTERNALDEVICE *, NV30F1_CTRL_GSYNC_MULTIPLY_DIVIDE_SETTINGS *); typedef NV_STATUS GsyncSetMulDiv (struct OBJGPU *, DACEXTERNALDEVICE *, NV30F1_CTRL_GSYNC_MULTIPLY_DIVIDE_SETTINGS *); -typedef NV_STATUS GsyncSetRasterSyncDecodeMode (struct OBJGPU *, DACEXTERNALDEVICE *); +typedef NV_STATUS GsyncSetRasterSyncDecodeMode (struct OBJGPU *, struct OBJGPU *, DACEXTERNALDEVICE *); typedef struct GSYNC_HAL_IFACES { diff --git a/src/nvidia/generated/g_nv_name_released.h b/src/nvidia/generated/g_nv_name_released.h index d1ab8103b..32379d679 100644 --- 
a/src/nvidia/generated/g_nv_name_released.h +++ b/src/nvidia/generated/g_nv_name_released.h @@ -5432,13 +5432,22 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x2C05, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070 Ti" }, { 0x2C18, 0x0000, 0x0000, "NVIDIA GeForce RTX 5090 Laptop GPU" }, { 0x2C19, 0x0000, 0x0000, "NVIDIA GeForce RTX 5080 Laptop GPU" }, + { 0x2C38, 0x0000, 0x0000, "NVIDIA RTX PRO 5000 Blackwell Generation Laptop GPU" }, + { 0x2C39, 0x0000, 0x0000, "NVIDIA RTX PRO 4000 Blackwell Generation Laptop GPU" }, { 0x2C58, 0x0000, 0x0000, "NVIDIA GeForce RTX 5090 Laptop GPU" }, { 0x2C59, 0x0000, 0x0000, "NVIDIA GeForce RTX 5080 Laptop GPU" }, { 0x2D04, 0x0000, 0x0000, "NVIDIA GeForce RTX 5060 Ti" }, + { 0x2D05, 0x0000, 0x0000, "NVIDIA GeForce RTX 5060" }, { 0x2D18, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070 Laptop GPU" }, + { 0x2D19, 0x0000, 0x0000, "NVIDIA GeForce RTX 5060 Laptop GPU" }, + { 0x2D39, 0x0000, 0x0000, "NVIDIA RTX PRO 2000 Blackwell Generation Laptop GPU" }, { 0x2D58, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070 Laptop GPU" }, + { 0x2D59, 0x0000, 0x0000, "NVIDIA GeForce RTX 5060 Laptop GPU" }, + { 0x2DB8, 0x0000, 0x0000, "NVIDIA RTX PRO 1000 Blackwell Generation Laptop GPU" }, + { 0x2DB9, 0x0000, 0x0000, "NVIDIA RTX PRO 500 Blackwell Generation Laptop GPU" }, { 0x2F04, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070" }, { 0x2F18, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070 Ti Laptop GPU" }, + { 0x2F38, 0x0000, 0x0000, "NVIDIA RTX PRO 3000 Blackwell Generation Laptop GPU" }, { 0x2F58, 0x0000, 0x0000, "NVIDIA GeForce RTX 5070 Ti Laptop GPU" }, { 0x13BD, 0x11cc, 0x10DE, "GRID M10-0B" }, { 0x13BD, 0x11cd, 0x10DE, "GRID M10-1B" }, diff --git a/src/nvidia/inc/kernel/gpu/external_device/dac_p2060.h b/src/nvidia/inc/kernel/gpu/external_device/dac_p2060.h index fc1b2bd0d..a687df534 100644 --- a/src/nvidia/inc/kernel/gpu/external_device/dac_p2060.h +++ b/src/nvidia/inc/kernel/gpu/external_device/dac_p2060.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 
1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -238,7 +238,7 @@ NV_STATUS gsyncRefSignal_P2060 (OBJGPU *, PDACEXTERNALDEVICE, REFT NV_STATUS gsyncRefMaster_P2060 (OBJGPU *, OBJGSYNC *, REFTYPE, NvU32 *DisplayMask, NvU32 *Refresh, NvBool retainMaster, NvBool skipSwapBarrierWar); -NV_STATUS gsyncRefSlaves_P2060 (OBJGPU *, PDACEXTERNALDEVICE, REFTYPE, NvU32 *DisplayMask_s, NvU32 *Refresh); +NV_STATUS gsyncRefSlaves_P2060 (OBJGPU *, OBJGSYNC *, REFTYPE, NvU32 *DisplayMask_s, NvU32 *Refresh); NV_STATUS gsyncGetCplStatus_P2060 (OBJGPU *, PDACEXTERNALDEVICE, GSYNCSTATUS, NvU32 *); NV_STATUS gsyncGetEmitTestSignal_P2060 (OBJGPU *, PDACEXTERNALDEVICE, NvU32 *); NV_STATUS gsyncSetEmitTestSignal_P2060 (OBJGPU *, PDACEXTERNALDEVICE, NvU32); diff --git a/src/nvidia/inc/kernel/gpu/external_device/dac_p2061.h b/src/nvidia/inc/kernel/gpu/external_device/dac_p2061.h index eff75919c..dbfa1b7f7 100644 --- a/src/nvidia/inc/kernel/gpu/external_device/dac_p2061.h +++ b/src/nvidia/inc/kernel/gpu/external_device/dac_p2061.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -33,6 +33,6 @@ NV_STATUS gsyncSetHouseSyncMode_P2061(OBJGPU *, DACEXTERNALDEVICE *, NvU8); NV_STATUS gsyncGetCplStatus_P2061 (OBJGPU *, DACEXTERNALDEVICE *, GSYNCSTATUS, NvU32 *); NV_STATUS gsyncSetSyncSkew_P2061_V204(OBJGPU *, DACEXTERNALDEVICE *, NvU32); NV_STATUS gsyncGetSyncSkew_P2061_V204(OBJGPU *, DACEXTERNALDEVICE *, NvU32 *); -NV_STATUS gsyncSetRasterSyncDecodeMode_P2061_V300(OBJGPU *, DACEXTERNALDEVICE *); +NV_STATUS gsyncSetRasterSyncDecodeMode_P2061_V300(OBJGPU *, OBJGPU *, DACEXTERNALDEVICE *); #endif // DAC_P2061_H diff --git a/src/nvidia/inc/kernel/gpu/mem_mgr/phys_mem_allocator/phys_mem_allocator_util.h b/src/nvidia/inc/kernel/gpu/mem_mgr/phys_mem_allocator/phys_mem_allocator_util.h index e9998c780..3de53e6a8 100644 --- a/src/nvidia/inc/kernel/gpu/mem_mgr/phys_mem_allocator/phys_mem_allocator_util.h +++ b/src/nvidia/inc/kernel/gpu/mem_mgr/phys_mem_allocator/phys_mem_allocator_util.h @@ -183,16 +183,12 @@ void pmaFreeList(PMA *pPma, PRANGELISTTYPE *ppList); * @param[in] physAddrBase The base address of this address tree * @param[in] pBlacklistPageBase Structure that contains the blacklisted pages * @param[in] blacklistCount Number of blacklisted pages - * @param[in] bBlacklistFromInforom Whether the blacklisted pages are coming from - * inforom (i.e., from heap/PMA init) or not - * (i.e., from ECC interrupt handling) * * @return NV_OK * NV_ERR_NO_MEMORY if memory allocation fails */ NV_STATUS pmaRegisterBlacklistInfo(PMA *pPma, NvU64 physAddrBase, - PPMA_BLACKLIST_ADDRESS pBlacklistPageBase, NvU32 blacklistCount, - NvBool bBlacklistFromInforom); + PPMA_BLACKLIST_ADDRESS pBlacklistPageBase, NvU32 blacklistCount); /*! 
* @brief Query blacklisting states tracked by PMA diff --git a/src/nvidia/inc/kernel/rmapi/rmapi_utils.h b/src/nvidia/inc/kernel/rmapi/rmapi_utils.h index 808170a04..e211e8b04 100644 --- a/src/nvidia/inc/kernel/rmapi/rmapi_utils.h +++ b/src/nvidia/inc/kernel/rmapi/rmapi_utils.h @@ -63,6 +63,6 @@ NvBool rmapiutilIsExternalClassIdInternalOnly(NvU32 externalClassId); NV_STATUS rmapiutilGetControlInfo(NvU32 cmd, NvU32 *pFlags, NvU32 *pAccessRight, NvU32 *pParamsSize); -NvBool rmapiutilSkipErrorMessageForUnsupportedVgpuGuestControl(NvU32 cmd); +NvBool rmapiutilSkipErrorMessageForUnsupportedVgpuGuestControl(OBJGPU *pGpu, NvU32 cmd); #endif /* RMAPI_UTILS_H */ diff --git a/src/nvidia/interface/nvrm_registry.h b/src/nvidia/interface/nvrm_registry.h index 30cb24120..5959d7dcc 100644 --- a/src/nvidia/interface/nvrm_registry.h +++ b/src/nvidia/interface/nvrm_registry.h @@ -2719,4 +2719,14 @@ #define NV_REG_STR_RM_FB_SANITY_CHECK_DISABLE (0x00000000) #define NV_REG_STR_RM_FB_SANITY_CHECK_DEFAULT NV_REG_STR_RM_FB_SANITY_CHECK_DISABLE +// +// Type DWORD +// Enable GR debug dump for CTXSW timeouts. 
+// +#define NV_REG_STR_RM_CTXSW_TIMEOUT_DEBUG_DUMP "RmCtxswTimeoutDebugDump" +#define NV_REG_STR_RM_CTXSW_TIMEOUT_DEBUG_DUMP_VAL 0:0 +#define NV_REG_STR_RM_CTXSW_TIMEOUT_DEBUG_DUMP_VAL_ENABLE 0x1 +#define NV_REG_STR_RM_CTXSW_TIMEOUT_DEBUG_DUMP_VAL_DISABLE 0x0 +#define NV_REG_STR_RM_CTXSW_TIMEOUT_DEBUG_DUMP_VAL_DEFAULT NV_REG_STR_RM_CTXSW_TIMEOUT_DEBUG_DUMP_VAL_DISABLE + #endif // NVRM_REGISTRY_H diff --git a/src/nvidia/src/kernel/gpu/external_device/arch/kepler/kern_gsync_p2060.c b/src/nvidia/src/kernel/gpu/external_device/arch/kepler/kern_gsync_p2060.c index d13ae3489..3d8d9fb7a 100644 --- a/src/nvidia/src/kernel/gpu/external_device/arch/kepler/kern_gsync_p2060.c +++ b/src/nvidia/src/kernel/gpu/external_device/arch/kepler/kern_gsync_p2060.c @@ -53,7 +53,7 @@ static void gsyncProgramFramelockEnable_P2060(OBJGPU *, PDACP2060EXTERNALD static NvBool gsyncIsStereoEnabled_p2060 (OBJGPU *, PDACEXTERNALDEVICE); static NV_STATUS gsyncProgramExtStereoPolarity_P2060 (OBJGPU *, PDACEXTERNALDEVICE); -static NV_STATUS gsyncProgramSlaves_P2060(OBJGPU *, PDACP2060EXTERNALDEVICE, NvU32); +static NV_STATUS gsyncProgramSlaves_P2060(OBJGPU *, OBJGSYNC *, NvU32); static NvU32 gsyncReadSlaves_P2060(OBJGPU *, PDACP2060EXTERNALDEVICE); static NV_STATUS gsyncProgramMaster_P2060(OBJGPU *, OBJGSYNC *, NvU32, NvBool, NvBool); static NvU32 gsyncReadMaster_P2060(OBJGPU *, PDACP2060EXTERNALDEVICE); @@ -2444,9 +2444,10 @@ gsyncProgramMaster_P2060 // // Set the RasterSync Decode Mode // This may return an error if the FW and GPU combination is invalid + // In this case, the ServerGpu is the same Gpu // NV_CHECK_OK_OR_RETURN(LEVEL_WARNING, - pGsync->gsyncHal.gsyncSetRasterSyncDecodeMode(pGpu, pGsync->pExtDev)); + pGsync->gsyncHal.gsyncSetRasterSyncDecodeMode(pGpu, pGpu, pGsync->pExtDev)); // // GPU will now be TS - Mark sync source for GPU on derived index. 
@@ -2631,7 +2632,7 @@ gsyncProgramMaster_P2060 Slaves = gsyncReadSlaves_P2060(pOtherGpu, pThis); if (Slaves) { - rmStatus = gsyncProgramSlaves_P2060(pOtherGpu, pThis, Slaves); + rmStatus = gsyncProgramSlaves_P2060(pOtherGpu, pGsync, Slaves); if (NV_OK != rmStatus) { NV_PRINTF(LEVEL_ERROR, @@ -2716,17 +2717,20 @@ static NV_STATUS gsyncProgramSlaves_P2060 ( OBJGPU *pGpu, - PDACP2060EXTERNALDEVICE pThis, + OBJGSYNC *pGsync, NvU32 Slaves ) { + DACP2060EXTERNALDEVICE *pThis = (DACP2060EXTERNALDEVICE *)pGsync->pExtDev; KernelDisplay *pKernelDisplay = GPU_GET_KERNEL_DISPLAY(pGpu); NvU32 DisplayIds[OBJ_MAX_HEADS]; NvU32 iface, head, index; NvU8 ctrl = 0, ctrl3 = 0; NvBool bCoupled, bHouseSelect, bLocalMaster, bEnableSlaves = (0 != Slaves); NV_STATUS rmStatus = NV_OK; - NvU32 numHeads = kdispGetNumHeads(pKernelDisplay); + NvU32 numHeads = kdispGetNumHeads(pKernelDisplay); + OBJSYS *pSys = SYS_GET_INSTANCE(); + OBJGSYNCMGR *pGsyncMgr = SYS_GET_GSYNCMGR(pSys); // This utility fn returns display id's associated with each head. extdevGetBoundHeadsAndDisplayIds(pGpu, DisplayIds); @@ -2811,6 +2815,72 @@ gsyncProgramSlaves_P2060 } } + // + // The RasterSyncDecodeMode of this Gsync board needs to get written if + // the server GPU is not on it. Find the server GPU and write based on that + // GPU's RasterSyncDecodeMode value. 
+ // + if (bEnableSlaves && !bLocalMaster && pGsyncMgr->gsyncCount > 1) + { + OBJGPU *pServerGpu = NULL; + NvU32 otherGsyncIndex; + + // Loops only need to go until we find pServerGpu + for (otherGsyncIndex = 0; + (otherGsyncIndex < pGsyncMgr->gsyncCount) && (pServerGpu == NULL); + otherGsyncIndex++) + { + DACP2060EXTERNALDEVICE *pOtherExtDev = + (DACP2060EXTERNALDEVICE *)pGsyncMgr->gsyncTable[otherGsyncIndex].pExtDev; + NvU32 otherIfaceIndex; + + if (pOtherExtDev == pThis) + { + // + // If the server GPU is on this same GSync board, we don't need + // to write anything, so don't bother checking + // + continue; + } + + for (otherIfaceIndex = 0; + (otherIfaceIndex < NV_P2060_MAX_IFACES_PER_GSYNC) && (pServerGpu == NULL); + otherIfaceIndex++) + { + NvU32 otherHeadIndex; + NvU32 serverBitmask = 0; + + if (!pOtherExtDev->Iface[otherIfaceIndex].GpuInfo.connected) + { + continue; + } + + for (otherHeadIndex = 0; otherHeadIndex < OBJ_MAX_HEADS; otherHeadIndex++) + { + serverBitmask |= pOtherExtDev->Iface[otherIfaceIndex].Sync.Master[otherHeadIndex]; + } + + if (serverBitmask == 0) + { + continue; + } + + // This GPU is the server! + pServerGpu = gpumgrGetGpuFromId(pOtherExtDev->Iface[otherIfaceIndex].GpuInfo.gpuId); + } + } + + if (pServerGpu != NULL) + { + // + // Set the RasterSync Decode Mode + // This may return an error if the FW and GPU combination is invalid + // + NV_CHECK_OK_OR_RETURN(LEVEL_WARNING, + pGsync->gsyncHal.gsyncSetRasterSyncDecodeMode(pGpu, pServerGpu, pGsync->pExtDev)); + } + } + // // With House sync enabled the crashlocking still need some investigations. // So filter out Housesyced systems before doing local crashlocks. 
@@ -3926,13 +3996,13 @@ NV_STATUS gsyncRefSlaves_P2060 ( OBJGPU *pGpu, - PDACEXTERNALDEVICE pExtDev, + OBJGSYNC *pGsync, REFTYPE rType, NvU32 *pDisplayMasks, NvU32 *pRefresh ) { - PDACP2060EXTERNALDEVICE pThis = (PDACP2060EXTERNALDEVICE)pExtDev; + PDACP2060EXTERNALDEVICE pThis = (DACP2060EXTERNALDEVICE *)pGsync->pExtDev; NV_STATUS status = NV_OK; NvU32 Slaves = pThis->Slaves; NvU32 RefreshRate = pThis->RefreshRate; @@ -3952,7 +4022,7 @@ gsyncRefSlaves_P2060 switch ( rType ) { case refSetCommit: - status = gsyncProgramSlaves_P2060(pGpu, pThis, Slaves); + status = gsyncProgramSlaves_P2060(pGpu, pGsync, Slaves); break; case refFetchGet: diff --git a/src/nvidia/src/kernel/gpu/external_device/arch/pascal/kern_gsync_p2061.c b/src/nvidia/src/kernel/gpu/external_device/arch/pascal/kern_gsync_p2061.c index 950ad1c25..1054bf000 100644 --- a/src/nvidia/src/kernel/gpu/external_device/arch/pascal/kern_gsync_p2061.c +++ b/src/nvidia/src/kernel/gpu/external_device/arch/pascal/kern_gsync_p2061.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2016-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -256,25 +256,30 @@ gsyncGetSyncSkew_P2061_V204 return NV_OK; } +// // Determine and write the proper RasterSync Decode Mode to the CONTROL5 register +// The RasterSync Decode Mode will be queried from the Server GPU, which may +// not be the same GPU that we are writing this register via. 
+// NV_STATUS gsyncSetRasterSyncDecodeMode_P2061_V300 ( OBJGPU *pGpu, + OBJGPU *pServerGpu, DACEXTERNALDEVICE *pExtDev ) { NV2080_CTRL_INTERNAL_GSYNC_GET_RASTER_SYNC_DECODE_MODE_PARAMS rasterSyncDecodeModeParams; - RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu); + RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pServerGpu); NvU8 old_data, data; // - // Get the raster sync mode flag from the GPU + // Get the raster sync mode flag from the server GPU // This is only used in P2061 v3.00+ for gsyncSetRasterDecodeMode() // - NV_ASSERT_OK_OR_RETURN(pRmApi->Control(pRmApi, pGpu->hInternalClient, - pGpu->hInternalSubdevice, NV2080_CTRL_CMD_INTERNAL_GSYNC_GET_RASTER_SYNC_DECODE_MODE, + NV_ASSERT_OK_OR_RETURN(pRmApi->Control(pRmApi, pServerGpu->hInternalClient, + pServerGpu->hInternalSubdevice, NV2080_CTRL_CMD_INTERNAL_GSYNC_GET_RASTER_SYNC_DECODE_MODE, &rasterSyncDecodeModeParams, sizeof(rasterSyncDecodeModeParams))); NV_ASSERT_OK_OR_RETURN(readregu008_extdeviceTargeted(pGpu, pExtDev, (NvU8)NV_P2061_CONTROL5, &data)); diff --git a/src/nvidia/src/kernel/gpu/external_device/gsync.c b/src/nvidia/src/kernel/gpu/external_device/gsync.c index 9abdec578..05669d3da 100644 --- a/src/nvidia/src/kernel/gpu/external_device/gsync.c +++ b/src/nvidia/src/kernel/gpu/external_device/gsync.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2008-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2008-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -1178,7 +1178,7 @@ gsyncIsAnyHeadFramelocked(OBJGSYNC *pGsync) { // Check if assigned slaves displays are there. 
if ((NV_OK == pGsync->gsyncHal.gsyncRefSlaves(pGpu, - pGsync->pExtDev, refRead, &assigned, &refresh)) && + pGsync, refRead, &assigned, &refresh)) && (assigned != 0)) { return NV_TRUE; @@ -1469,7 +1469,7 @@ gsyncGetControlSync(OBJGSYNC *pGsync, } else { - status |= pGsync->gsyncHal.gsyncRefSlaves(pGpu, pGsync->pExtDev, refFetchGet, + status |= pGsync->gsyncHal.gsyncRefSlaves(pGpu, pGsync, refFetchGet, &pParams->displays, &pParams->refresh); } } @@ -1545,10 +1545,10 @@ gsyncSetControlSync(OBJGSYNC *pGsync, } else { - status |= pGsync->gsyncHal.gsyncRefSlaves(pGpu, pGsync->pExtDev, + status |= pGsync->gsyncHal.gsyncRefSlaves(pGpu, pGsync, refRead, &assigned, &refresh); pParams->displays |= assigned; - status |= pGsync->gsyncHal.gsyncRefSlaves(pGpu, pGsync->pExtDev, + status |= pGsync->gsyncHal.gsyncRefSlaves(pGpu, pGsync, refSetCommit, &pParams->displays, &pParams->refresh); } @@ -1599,10 +1599,10 @@ gsyncSetControlUnsync(OBJGSYNC *pGsync, } else { - status |= pGsync->gsyncHal.gsyncRefSlaves(pGpu, pGsync->pExtDev, + status |= pGsync->gsyncHal.gsyncRefSlaves(pGpu, pGsync, refRead, &assigned, &refresh); pParams->displays = assigned & ~pParams->displays; - status |= pGsync->gsyncHal.gsyncRefSlaves(pGpu, pGsync->pExtDev, + status |= pGsync->gsyncHal.gsyncRefSlaves(pGpu, pGsync, refSetCommit, &pParams->displays, &refresh); } @@ -2385,7 +2385,7 @@ static NV_STATUS gsyncNullRefSlaves ( OBJGPU *pGpu, - PDACEXTERNALDEVICE pExtDev, + OBJGSYNC *pGsync, REFTYPE rType, NvU32 *pDisplayMasks, NvU32 *pRefresh @@ -2498,6 +2498,7 @@ static NV_STATUS gsyncNullSetRasterSyncDecodeMode ( OBJGPU *pGpu, + OBJGPU *pServerGpu, DACEXTERNALDEVICE *pExtDev ) { @@ -2507,11 +2508,11 @@ gsyncNullSetRasterSyncDecodeMode // NV2080_CTRL_INTERNAL_GSYNC_GET_RASTER_SYNC_DECODE_MODE_PARAMS rasterSyncDecodeModeParams; - RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu); + RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pServerGpu); // Pre-3.00 FW can only use NV2080_CTRL_CMD_INTERNAL_GSYNC_GET_RASTER_SYNC_DECODE_MODE - 
NV_ASSERT_OK_OR_RETURN(pRmApi->Control(pRmApi, pGpu->hInternalClient, - pGpu->hInternalSubdevice, NV2080_CTRL_CMD_INTERNAL_GSYNC_GET_RASTER_SYNC_DECODE_MODE, + NV_ASSERT_OK_OR_RETURN(pRmApi->Control(pRmApi, pServerGpu->hInternalClient, + pServerGpu->hInternalSubdevice, NV2080_CTRL_CMD_INTERNAL_GSYNC_GET_RASTER_SYNC_DECODE_MODE, &rasterSyncDecodeModeParams, sizeof(rasterSyncDecodeModeParams))); NV_CHECK_OR_RETURN(LEVEL_WARNING, diff --git a/src/nvidia/src/kernel/gpu/gsp/arch/blackwell/kernel_gsp_ecc_gb100.c b/src/nvidia/src/kernel/gpu/gsp/arch/blackwell/kernel_gsp_ecc_gb100.c index 95870bf31..469786048 100644 --- a/src/nvidia/src/kernel/gpu/gsp/arch/blackwell/kernel_gsp_ecc_gb100.c +++ b/src/nvidia/src/kernel/gpu/gsp/arch/blackwell/kernel_gsp_ecc_gb100.c @@ -137,4 +137,5 @@ kgspEccServiceUncorrError_GB100 gpuNotifySubDeviceEvent(pGpu, NV2080_NOTIFIERS_ECC_DBE, NULL, 0, 0, (NvU16)NV2080_CTRL_GPU_ECC_UNIT_GSP); nvErrorLog_va((void *)pGpu, errorType, "GSP-RISCV uncorrectable ECC error"); + nvErrorLog_va((void *)pGpu, UNCORRECTABLE_SRAM_ERROR, "GSP-RISCV, Uncorrectable SRAM error"); } diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/phys_mem_allocator/phys_mem_allocator.c b/src/nvidia/src/kernel/gpu/mem_mgr/phys_mem_allocator/phys_mem_allocator.c index 4b1f6c9c5..934db1fc9 100644 --- a/src/nvidia/src/kernel/gpu/mem_mgr/phys_mem_allocator/phys_mem_allocator.c +++ b/src/nvidia/src/kernel/gpu/mem_mgr/phys_mem_allocator/phys_mem_allocator.c @@ -534,7 +534,7 @@ pmaRegisterRegion PMA_SCRUB_INITIALIZE); } - status = pmaRegisterBlacklistInfo(pPma, physBase, pBlacklistPageBase, blacklistCount, NV_TRUE); + status = pmaRegisterBlacklistInfo(pPma, physBase, pBlacklistPageBase, blacklistCount); if (status != NV_OK) { pPma->pMapInfo->pmaMapDestroy(pMap); @@ -1971,7 +1971,7 @@ pmaAddToBlacklistTracking { blacklistPages.physOffset = physAddr; blacklistPages.bIsDynamic = NV_TRUE; - status = pmaRegisterBlacklistInfo(pPma, 0, &blacklistPages, 1, NV_FALSE); + status = 
pmaRegisterBlacklistInfo(pPma, 0, &blacklistPages, 1); } return status; } diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/phys_mem_allocator/phys_mem_allocator_util.c b/src/nvidia/src/kernel/gpu/mem_mgr/phys_mem_allocator/phys_mem_allocator_util.c index 793be0b9f..8e471147a 100644 --- a/src/nvidia/src/kernel/gpu/mem_mgr/phys_mem_allocator/phys_mem_allocator_util.c +++ b/src/nvidia/src/kernel/gpu/mem_mgr/phys_mem_allocator/phys_mem_allocator_util.c @@ -1239,8 +1239,7 @@ pmaRegisterBlacklistInfo PMA *pPma, NvU64 physAddrBase, PMA_BLACKLIST_ADDRESS *pBlacklistPageBase, - NvU32 blacklistCount, - NvBool bBlacklistFromInforom + NvU32 blacklistCount ) { NvU32 i; @@ -1288,34 +1287,24 @@ pmaRegisterBlacklistInfo // This is only needed for NUMA systems that auto online NUMA memory. // Other systems (e.g., P9) already do blacklisting in nvidia-persistenced. // + // Page blacklisting is done regardless of whether it will also be done by + // CPU RAS_FW via CPER handling (which is done in non-vGPU cases). + // This is fine, because multiple simultaneous calls to page blacklisting + // API memory_failure() do not cause any issues. + // if (pPma->bNuma && pPma->bNumaAutoOnline) { - // - // Only blacklist pages from inforom (i.e., during heap/PMA init) need - // to be blacklisted with kernel here. The blacklist pages stored in - // inforom need to remain blacklisted persistently across GPU resets - - // kernel won't automatically blacklist these so RM must do it - // explicitly here. - // - // Blacklist pages not from inforom (i.e., from ECC interrupt handling) - // do not need to be blacklisted with kernel. This is because the ECC - // interrupt will automatically trigger kernel itself to blacklist the page. 
- // - if (bBlacklistFromInforom) + NV_STATUS status; + + NV_PRINTF(LEVEL_INFO, + "NUMA enabled - blacklisting page through kernel at address 0x%llx (GPA) 0x%llx (SPA)\n", + pBlacklistPageBase[blacklistEntryIn].physOffset, + pBlacklistPageBase[blacklistEntryIn].physOffset + pPma->coherentCpuFbBase); + + status = osOfflinePageAtAddress(pBlacklistPageBase[blacklistEntryIn].physOffset + pPma->coherentCpuFbBase); + if (status != NV_OK) { - NV_STATUS status; - - // Use physOffset without 64K alignment, because kernel may use a different page size. - NV_PRINTF(LEVEL_INFO, - "NUMA enabled - blacklisting page through kernel at address 0x%llx (GPA) 0x%llx (SPA)\n", - pBlacklistPageBase[blacklistEntryIn].physOffset, - pBlacklistPageBase[blacklistEntryIn].physOffset + pPma->coherentCpuFbBase); - - status = osOfflinePageAtAddress(pBlacklistPageBase[blacklistEntryIn].physOffset + pPma->coherentCpuFbBase); - if (status != NV_OK) - { - NV_PRINTF(LEVEL_ERROR, "osOfflinePageAtAddress() failed with status: %d\n", status); - } + NV_PRINTF(LEVEL_ERROR, "osOfflinePageAtAddress() failed with status: %d\n", status); } } diff --git a/src/nvidia/src/kernel/gpu/mig_mgr/arch/ampere/kmigmgr_ga100.c b/src/nvidia/src/kernel/gpu/mig_mgr/arch/ampere/kmigmgr_ga100.c index e6132fada..2e0ee9f2d 100644 --- a/src/nvidia/src/kernel/gpu/mig_mgr/arch/ampere/kmigmgr_ga100.c +++ b/src/nvidia/src/kernel/gpu/mig_mgr/arch/ampere/kmigmgr_ga100.c @@ -173,6 +173,11 @@ kmigmgrIsGPUInstanceFlagValid_GA100 NvU32 gfxSizeFlag = DRF_VAL(2080_CTRL_GPU, _PARTITION_FLAG, _GFX_SIZE, gpuInstanceFlag); + if (!FLD_TEST_REF(NV2080_CTRL_GPU_PARTITION_FLAG_REQ_ALL_MEDIA, _DEFAULT, gpuInstanceFlag)) + { + return NV_FALSE; + } + switch (memSizeFlag) { case NV2080_CTRL_GPU_PARTITION_FLAG_MEMORY_SIZE_FULL: diff --git a/src/nvidia/src/kernel/gpu/mig_mgr/arch/blackwell/kmigmgr_gb100.c b/src/nvidia/src/kernel/gpu/mig_mgr/arch/blackwell/kmigmgr_gb100.c index 9a4d33192..87287d944 100644 --- 
a/src/nvidia/src/kernel/gpu/mig_mgr/arch/blackwell/kmigmgr_gb100.c +++ b/src/nvidia/src/kernel/gpu/mig_mgr/arch/blackwell/kmigmgr_gb100.c @@ -50,6 +50,11 @@ kmigmgrIsGPUInstanceFlagValid_GB100 NvU32 gfxSizeFlag = DRF_VAL(2080_CTRL_GPU, _PARTITION_FLAG, _GFX_SIZE, gpuInstanceFlag); + if (!FLD_TEST_REF(NV2080_CTRL_GPU_PARTITION_FLAG_REQ_ALL_MEDIA, _DEFAULT, gpuInstanceFlag)) + { + return NV_FALSE; + } + switch (memSizeFlag) { case NV2080_CTRL_GPU_PARTITION_FLAG_MEMORY_SIZE_FULL: diff --git a/src/nvidia/src/kernel/gpu/mig_mgr/arch/blackwell/kmigmgr_gb10b.c b/src/nvidia/src/kernel/gpu/mig_mgr/arch/blackwell/kmigmgr_gb10b.c index c30138001..2f2738d4f 100644 --- a/src/nvidia/src/kernel/gpu/mig_mgr/arch/blackwell/kmigmgr_gb10b.c +++ b/src/nvidia/src/kernel/gpu/mig_mgr/arch/blackwell/kmigmgr_gb10b.c @@ -145,6 +145,11 @@ kmigmgrIsGPUInstanceFlagValid_GB10B NvU32 gfxSizeFlag = DRF_VAL(2080_CTRL_GPU, _PARTITION_FLAG, _GFX_SIZE, gpuInstanceFlag); + if (!FLD_TEST_REF(NV2080_CTRL_GPU_PARTITION_FLAG_REQ_ALL_MEDIA, _DEFAULT, gpuInstanceFlag)) + { + return NV_FALSE; + } + switch (memSizeFlag) { case NV2080_CTRL_GPU_PARTITION_FLAG_MEMORY_SIZE_FULL: diff --git a/src/nvidia/src/kernel/gpu/mig_mgr/arch/blackwell/kmigmgr_gb202.c b/src/nvidia/src/kernel/gpu/mig_mgr/arch/blackwell/kmigmgr_gb202.c index edc69ce54..0cf164471 100644 --- a/src/nvidia/src/kernel/gpu/mig_mgr/arch/blackwell/kmigmgr_gb202.c +++ b/src/nvidia/src/kernel/gpu/mig_mgr/arch/blackwell/kmigmgr_gb202.c @@ -50,6 +50,14 @@ kmigmgrIsGPUInstanceFlagValid_GB202 NvU32 gfxSizeFlag = DRF_VAL(2080_CTRL_GPU, _PARTITION_FLAG, _GFX_SIZE, gpuInstanceFlag); + // If incorrect all video flag, then fail + if (!(FLD_TEST_REF(NV2080_CTRL_GPU_PARTITION_FLAG_REQ_ALL_MEDIA, _DEFAULT, gpuInstanceFlag) || + FLD_TEST_REF(NV2080_CTRL_GPU_PARTITION_FLAG_REQ_ALL_MEDIA, _ENABLE, gpuInstanceFlag) || + FLD_TEST_REF(NV2080_CTRL_GPU_PARTITION_FLAG_REQ_ALL_MEDIA, _DISABLE, gpuInstanceFlag))) + { + return NV_FALSE; + } + switch (memSizeFlag) { case 
NV2080_CTRL_GPU_PARTITION_FLAG_MEMORY_SIZE_FULL: @@ -141,6 +149,11 @@ kmigmgrIsGPUInstanceCombinationValid_GB202 { return NV_FALSE; } + + if (!FLD_TEST_REF(NV2080_CTRL_GPU_PARTITION_FLAG_REQ_ALL_MEDIA, _DEFAULT, gpuInstanceFlag)) + { + return NV_FALSE; + } } switch (computeSizeFlag) @@ -148,6 +161,8 @@ kmigmgrIsGPUInstanceCombinationValid_GB202 case NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_FULL: NV_CHECK_OR_RETURN(LEVEL_SILENT, memSizeFlag == NV2080_CTRL_GPU_PARTITION_FLAG_MEMORY_SIZE_FULL, NV_FALSE); + NV_CHECK_OR_RETURN(LEVEL_SILENT, FLD_TEST_REF(NV2080_CTRL_GPU_PARTITION_FLAG_REQ_ALL_MEDIA, _DEFAULT, + gpuInstanceFlag), NV_FALSE); break; case NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_HALF: NV_CHECK_OR_RETURN(LEVEL_SILENT, memSizeFlag == NV2080_CTRL_GPU_PARTITION_FLAG_MEMORY_SIZE_HALF, diff --git a/src/nvidia/src/kernel/gpu/mig_mgr/arch/hopper/kmigmgr_gh100.c b/src/nvidia/src/kernel/gpu/mig_mgr/arch/hopper/kmigmgr_gh100.c index 860ededee..71dff99f9 100644 --- a/src/nvidia/src/kernel/gpu/mig_mgr/arch/hopper/kmigmgr_gh100.c +++ b/src/nvidia/src/kernel/gpu/mig_mgr/arch/hopper/kmigmgr_gh100.c @@ -51,6 +51,12 @@ kmigmgrIsGPUInstanceFlagValid_GH100 _COMPUTE_SIZE, gpuInstanceFlag); NvU32 gfxSizeFlag = DRF_VAL(2080_CTRL_GPU, _PARTITION_FLAG, _GFX_SIZE, gpuInstanceFlag); + + if (!FLD_TEST_REF(NV2080_CTRL_GPU_PARTITION_FLAG_REQ_ALL_MEDIA, _DEFAULT, gpuInstanceFlag)) + { + return NV_FALSE; + } + switch (memSizeFlag) { case NV2080_CTRL_GPU_PARTITION_FLAG_MEMORY_SIZE_FULL: diff --git a/src/nvidia/src/kernel/gpu/mig_mgr/gpu_instance_subscription.c b/src/nvidia/src/kernel/gpu/mig_mgr/gpu_instance_subscription.c index cb4d43174..1fa2ce5bd 100644 --- a/src/nvidia/src/kernel/gpu/mig_mgr/gpu_instance_subscription.c +++ b/src/nvidia/src/kernel/gpu/mig_mgr/gpu_instance_subscription.c @@ -442,7 +442,7 @@ gisubscriptionCtrlCmdExecPartitionsCreate_IMPL .inst.request.requestFlags = pParams->flags }; - if (!hypervisorIsVgxHyper()) + if (!gpuIsSriovEnabled(pGpu)) { 
request.inst.request.requestFlags = FLD_SET_DRF(C637_CTRL, _DMA_EXEC_PARTITIONS_CREATE_REQUEST, _WITH_PART_ID, _FALSE, request.inst.request.requestFlags); } diff --git a/src/nvidia/src/kernel/rmapi/control.c b/src/nvidia/src/kernel/rmapi/control.c index 927976cd7..b99f4609d 100644 --- a/src/nvidia/src/kernel/rmapi/control.c +++ b/src/nvidia/src/kernel/rmapi/control.c @@ -728,7 +728,7 @@ NV_STATUS serverControl_ValidateCookie (pRmCtrlExecuteCookie->ctrlFlags & RMCTRL_FLAGS_ROUTE_TO_PHYSICAL) && !(pRmCtrlExecuteCookie->ctrlFlags & (RMCTRL_FLAGS_ROUTE_TO_VGPU_HOST | RMCTRL_FLAGS_PHYSICAL_IMPLEMENTED_ON_VGPU_GUEST))) { - if (!rmapiutilSkipErrorMessageForUnsupportedVgpuGuestControl(pRmCtrlParams->cmd)) + if (!rmapiutilSkipErrorMessageForUnsupportedVgpuGuestControl(pRmCtrlParams->pGpu, pRmCtrlParams->cmd)) { NV_PRINTF(LEVEL_ERROR, "Unsupported ROUTE_TO_PHYSICAL control 0x%x was called on vGPU guest\n", pRmCtrlParams->cmd); } diff --git a/src/nvidia/src/kernel/rmapi/rmapi_utils.c b/src/nvidia/src/kernel/rmapi/rmapi_utils.c index 439792d0a..283246ebc 100644 --- a/src/nvidia/src/kernel/rmapi/rmapi_utils.c +++ b/src/nvidia/src/kernel/rmapi/rmapi_utils.c @@ -35,6 +35,7 @@ #include "ctrl/ctrl2080.h" #include "ctrl/ctrl402c.h" #include "ctrl/ctrl90cc.h" +#include "ctrl/ctrl90e6.h" #include "ctrl/ctrl90e7/ctrl90e7bbx.h" NV_STATUS @@ -194,7 +195,7 @@ rmapiutilGetControlInfo return NV_ERR_OBJECT_NOT_FOUND; } -NvBool rmapiutilSkipErrorMessageForUnsupportedVgpuGuestControl(NvU32 cmd) +NvBool rmapiutilSkipErrorMessageForUnsupportedVgpuGuestControl(OBJGPU *pGpu, NvU32 cmd) { switch (cmd) { @@ -239,6 +240,10 @@ NvBool rmapiutilSkipErrorMessageForUnsupportedVgpuGuestControl(NvU32 cmd) case NV90E7_CTRL_CMD_BBX_GET_LAST_FLUSH_TIME: return NV_TRUE; + case NV90E6_CTRL_CMD_MASTER_GET_ERROR_INTR_OFFSET_MASK: + if (!IsTURINGorBetter(pGpu)) + return NV_TRUE; + default: return NV_FALSE; } diff --git a/src/nvidia/src/kernel/vgpu/vgpu_util.c b/src/nvidia/src/kernel/vgpu/vgpu_util.c index 
eae0b524e..735b8a9d2 100644 --- a/src/nvidia/src/kernel/vgpu/vgpu_util.c +++ b/src/nvidia/src/kernel/vgpu/vgpu_util.c @@ -267,7 +267,7 @@ static NV_STATUS vgpuExpandSysmemPfnBitMapList(OBJGPU *pGpu, NvU64 pfn) vgpuSysmemPfnInfo.guestMaxPfn = node->nodeEndPfn; vgpuSysmemPfnInfo.sizeInBytes = vgpuSysmemPfnInfo.guestMaxPfn / 8; - } while (vgpuSysmemPfnInfo.guestMaxPfn < pfn); + } while (vgpuSysmemPfnInfo.guestMaxPfn <= pfn); // Alloc the ref count buffer temp_pfn_ref_count = portMemAllocNonPaged(sizeof(NvU16) * vgpuSysmemPfnInfo.guestMaxPfn); @@ -445,7 +445,7 @@ NV_STATUS vgpuUpdateSysmemPfnBitMap if (bAlloc) { - if (pfn > vgpuSysmemPfnInfo.guestMaxPfn) + if (pfn >= vgpuSysmemPfnInfo.guestMaxPfn) { NV_PRINTF(LEVEL_INFO, "Update sysmem pfn bitmap for pfn: 0x%llx > guestMaxPfn: 0x%llx\n", pfn, vgpuSysmemPfnInfo.guestMaxPfn); diff --git a/version.mk b/version.mk index fe94e16c7..44541e652 100644 --- a/version.mk +++ b/version.mk @@ -1,4 +1,4 @@ -NVIDIA_VERSION = 570.153.02 +NVIDIA_VERSION = 570.158.01 # This file. VERSION_MK_FILE := $(lastword $(MAKEFILE_LIST))