mirror of
https://github.com/NVIDIA/open-gpu-kernel-modules.git
synced 2026-01-27 11:39:46 +00:00
535.104.12
This commit is contained in:
@@ -2,6 +2,8 @@
|
||||
|
||||
## Release 535 Entries
|
||||
|
||||
### [535.104.12] 2023-09-25
|
||||
|
||||
### [535.104.05] 2023-08-22
|
||||
|
||||
### [535.98] 2023-08-08
|
||||
|
||||
12
README.md
12
README.md
@@ -1,7 +1,7 @@
|
||||
# NVIDIA Linux Open GPU Kernel Module Source
|
||||
|
||||
This is the source release of the NVIDIA Linux open GPU kernel modules,
|
||||
version 535.104.05.
|
||||
version 535.104.12.
|
||||
|
||||
|
||||
## How to Build
|
||||
@@ -17,7 +17,7 @@ as root:
|
||||
|
||||
Note that the kernel modules built here must be used with GSP
|
||||
firmware and user-space NVIDIA GPU driver components from a corresponding
|
||||
535.104.05 driver release. This can be achieved by installing
|
||||
535.104.12 driver release. This can be achieved by installing
|
||||
the NVIDIA GPU driver from the .run file using the `--no-kernel-modules`
|
||||
option. E.g.,
|
||||
|
||||
@@ -180,7 +180,7 @@ software applications.
|
||||
## Compatible GPUs
|
||||
|
||||
The open-gpu-kernel-modules can be used on any Turing or later GPU
|
||||
(see the table below). However, in the 535.104.05 release,
|
||||
(see the table below). However, in the 535.104.12 release,
|
||||
GeForce and Workstation support is still considered alpha-quality.
|
||||
|
||||
To enable use of the open kernel modules on GeForce and Workstation GPUs,
|
||||
@@ -188,7 +188,7 @@ set the "NVreg_OpenRmEnableUnsupportedGpus" nvidia.ko kernel module
|
||||
parameter to 1. For more details, see the NVIDIA GPU driver end user
|
||||
README here:
|
||||
|
||||
https://us.download.nvidia.com/XFree86/Linux-x86_64/535.104.05/README/kernel_open.html
|
||||
https://us.download.nvidia.com/XFree86/Linux-x86_64/535.104.12/README/kernel_open.html
|
||||
|
||||
In the below table, if three IDs are listed, the first is the PCI Device
|
||||
ID, the second is the PCI Subsystem Vendor ID, and the third is the PCI
|
||||
@@ -856,6 +856,10 @@ Subsystem Device ID.
|
||||
| NVIDIA RTX 4000 SFF Ada Generation | 27B0 103C 16FA |
|
||||
| NVIDIA RTX 4000 SFF Ada Generation | 27B0 10DE 16FA |
|
||||
| NVIDIA RTX 4000 SFF Ada Generation | 27B0 17AA 16FA |
|
||||
| NVIDIA RTX 4500 Ada Generation | 27B1 1028 180C |
|
||||
| NVIDIA RTX 4500 Ada Generation | 27B1 103C 180C |
|
||||
| NVIDIA RTX 4500 Ada Generation | 27B1 10DE 180C |
|
||||
| NVIDIA RTX 4500 Ada Generation | 27B1 17AA 180C |
|
||||
| NVIDIA RTX 4000 Ada Generation | 27B2 1028 181B |
|
||||
| NVIDIA RTX 4000 Ada Generation | 27B2 103C 181B |
|
||||
| NVIDIA RTX 4000 Ada Generation | 27B2 10DE 181B |
|
||||
|
||||
@@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
|
||||
EXTRA_CFLAGS += -I$(src)
|
||||
EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
|
||||
EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
|
||||
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.104.05\"
|
||||
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.104.12\"
|
||||
|
||||
ifneq ($(SYSSRCHOST1X),)
|
||||
EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
|
||||
|
||||
@@ -5743,23 +5743,25 @@ compile_test() {
|
||||
compile_check_conftest "$CODE" "NV_IOASID_GET_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
mm_pasid_set)
|
||||
mm_pasid_drop)
|
||||
#
|
||||
# Determine if mm_pasid_set() function is present
|
||||
# Determine if mm_pasid_drop() function is present
|
||||
#
|
||||
# Added by commit 701fac40384f ("iommu/sva: Assign a PASID to mm
|
||||
# on PASID allocation and free it on mm exit") in v5.18.
|
||||
# Moved to linux/iommu.h in commit cd3891158a77 ("iommu/sva: Move
|
||||
# PASID helpers to sva code") in v6.4.
|
||||
#
|
||||
# mm_pasid_set() function was added by commit
|
||||
# 701fac40384f07197b106136012804c3cae0b3de (iommu/sva: Assign a
|
||||
# PASID to mm on PASID allocation and free it on mm exit) in v5.18.
|
||||
# (2022-02-15).
|
||||
CODE="
|
||||
#if defined(NV_LINUX_SCHED_MM_H_PRESENT)
|
||||
#include <linux/sched/mm.h>
|
||||
#endif
|
||||
void conftest_mm_pasid_set(void) {
|
||||
mm_pasid_set();
|
||||
#include <linux/iommu.h>
|
||||
void conftest_mm_pasid_drop(void) {
|
||||
mm_pasid_drop();
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_MM_PASID_SET_PRESENT" "" "functions"
|
||||
compile_check_conftest "$CODE" "NV_MM_PASID_DROP_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
drm_crtc_state_has_no_vblank)
|
||||
|
||||
@@ -81,7 +81,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_memory_uc
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_pages_uc
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ioasid_get
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_set
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_drop
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += migrate_vma_setup
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmget_not_zero
|
||||
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmgrab
|
||||
|
||||
@@ -32,19 +32,23 @@
|
||||
// For ATS support on aarch64, arm_smmu_sva_bind() is needed for
|
||||
// iommu_sva_bind_device() calls. Unfortunately, arm_smmu_sva_bind() is not
|
||||
// conftest-able. We instead look for the presence of ioasid_get() or
|
||||
// mm_pasid_set(). ioasid_get() was added in the same patch series as
|
||||
// arm_smmu_sva_bind() and removed in v6.0. mm_pasid_set() was added in the
|
||||
// mm_pasid_drop(). ioasid_get() was added in the same patch series as
|
||||
// arm_smmu_sva_bind() and removed in v6.0. mm_pasid_drop() was added in the
|
||||
// same patch as the removal of ioasid_get(). We assume the presence of
|
||||
// arm_smmu_sva_bind() if ioasid_get(v5.11 - v5.17) or mm_pasid_set(v5.18+) is
|
||||
// arm_smmu_sva_bind() if ioasid_get(v5.11 - v5.17) or mm_pasid_drop(v5.18+) is
|
||||
// present.
|
||||
//
|
||||
// arm_smmu_sva_bind() was added with commit
|
||||
// 32784a9562fb0518b12e9797ee2aec52214adf6f and ioasid_get() was added with
|
||||
// commit cb4789b0d19ff231ce9f73376a023341300aed96 (11/23/2020). Commit
|
||||
// 701fac40384f07197b106136012804c3cae0b3de (02/15/2022) removed ioasid_get()
|
||||
// and added mm_pasid_set().
|
||||
#if UVM_CAN_USE_MMU_NOTIFIERS() && (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_SET_PRESENT))
|
||||
#define UVM_ATS_SVA_SUPPORTED() 1
|
||||
// and added mm_pasid_drop().
|
||||
#if UVM_CAN_USE_MMU_NOTIFIERS() && (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_DROP_PRESENT))
|
||||
#if defined(CONFIG_IOMMU_SVA)
|
||||
#define UVM_ATS_SVA_SUPPORTED() 1
|
||||
#else
|
||||
#define UVM_ATS_SVA_SUPPORTED() 0
|
||||
#endif
|
||||
#else
|
||||
#define UVM_ATS_SVA_SUPPORTED() 0
|
||||
#endif
|
||||
|
||||
@@ -43,18 +43,18 @@
|
||||
#endif
|
||||
|
||||
#if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS)
|
||||
#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r535/r537_13-260"
|
||||
#define NV_BUILD_CHANGELIST_NUM (33206197)
|
||||
#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r535/r537_13-267"
|
||||
#define NV_BUILD_CHANGELIST_NUM (33312039)
|
||||
#define NV_BUILD_TYPE "Official"
|
||||
#define NV_BUILD_NAME "rel/gpu_drv/r535/r537_13-260"
|
||||
#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33206197)
|
||||
#define NV_BUILD_NAME "rel/gpu_drv/r535/r537_13-267"
|
||||
#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33312039)
|
||||
|
||||
#else /* Windows builds */
|
||||
#define NV_BUILD_BRANCH_VERSION "r537_13-1"
|
||||
#define NV_BUILD_CHANGELIST_NUM (33194057)
|
||||
#define NV_BUILD_BRANCH_VERSION "r537_13-7"
|
||||
#define NV_BUILD_CHANGELIST_NUM (33274399)
|
||||
#define NV_BUILD_TYPE "Official"
|
||||
#define NV_BUILD_NAME "537.17"
|
||||
#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33194057)
|
||||
#define NV_BUILD_NAME "537.39"
|
||||
#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33274399)
|
||||
#define NV_BUILD_BRANCH_BASE_VERSION R535
|
||||
#endif
|
||||
// End buildmeister python edited section
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) || defined(NV_VMWARE) || defined(NV_QNX) || defined(NV_INTEGRITY) || \
|
||||
(defined(RMCFG_FEATURE_PLATFORM_GSP) && RMCFG_FEATURE_PLATFORM_GSP == 1)
|
||||
|
||||
#define NV_VERSION_STRING "535.104.05"
|
||||
#define NV_VERSION_STRING "535.104.12"
|
||||
|
||||
#else
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __gh100_dev_fb_h_
|
||||
#define __gh100_dev_fb_h_
|
||||
#define NV_PFB_NISO_FLUSH_SYSMEM_ADDR_SHIFT 8 /* */
|
||||
@@ -29,4 +29,25 @@
|
||||
#define NV_PFB_FBHUB_PCIE_FLUSH_SYSMEM_ADDR_HI 0x00100A38 /* RW-4R */
|
||||
#define NV_PFB_FBHUB_PCIE_FLUSH_SYSMEM_ADDR_HI_ADR 31:0 /* RWIVF */
|
||||
#define NV_PFB_FBHUB_PCIE_FLUSH_SYSMEM_ADDR_HI_ADR_MASK 0x000FFFFF /* ----V */
|
||||
|
||||
#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT 0x00100E78 /* RW-4R */
|
||||
#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT 0x00100E78 /* RW-4R */
|
||||
#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT_TOTAL 15:0 /* RWEVF */
|
||||
#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT 0 /* RWE-V */
|
||||
#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT_UNIQUE 31:16 /* RWEVF */
|
||||
#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT 0 /* RWE-V */
|
||||
|
||||
#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT 0x00100E8C /* RW-4R */
|
||||
#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT 0x00100E8C /* RW-4R */
|
||||
#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT_TOTAL 15:0 /* RWEVF */
|
||||
#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT 0 /* RWE-V */
|
||||
#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT_UNIQUE 31:16 /* RWEVF */
|
||||
#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT 0 /* RWE-V */
|
||||
|
||||
#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT 0x00100EA0 /* RW-4R */
|
||||
#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT 0x00100EA0 /* RW-4R */
|
||||
#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT_TOTAL 15:0 /* RWEVF */
|
||||
#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT 0 /* RWE-V */
|
||||
#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT_UNIQUE 31:16 /* RWEVF */
|
||||
#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT 0 /* RWE-V */
|
||||
#endif // __gh100_dev_fb_h_
|
||||
|
||||
29
src/common/inc/swref/published/hopper/gh100/dev_fbpa.h
Normal file
29
src/common/inc/swref/published/hopper/gh100/dev_fbpa.h
Normal file
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef __gh100_dev_fbpa_h_
|
||||
#define __gh100_dev_fbpa_h_
|
||||
|
||||
#define NV_PFB_FBPA_0_ECC_DED_COUNT__SIZE_1 4 /* */
|
||||
#define NV_PFB_FBPA_0_ECC_DED_COUNT(i) (0x009025A0+(i)*4) /* RW-4A */
|
||||
#endif // __gh100_dev_fbpa_h_
|
||||
33
src/common/inc/swref/published/hopper/gh100/dev_ltc.h
Normal file
33
src/common/inc/swref/published/hopper/gh100/dev_ltc.h
Normal file
@@ -0,0 +1,33 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef __gh100_dev_ltc_h_
|
||||
#define __gh100_dev_ltc_h_
|
||||
|
||||
#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT 0x001404f8 /* RW-4R */
|
||||
#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT_TOTAL 15:0 /* RWIVF */
|
||||
#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT 0x0000 /* RWI-V */
|
||||
#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT_UNIQUE 31:16 /* RWIVF */
|
||||
#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT 0x0000 /* RWI-V */
|
||||
|
||||
#endif // __gh100_dev_ltc_h_
|
||||
52
src/common/inc/swref/published/hopper/gh100/dev_nv_xpl.h
Normal file
52
src/common/inc/swref/published/hopper/gh100/dev_nv_xpl.h
Normal file
@@ -0,0 +1,52 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef __gh100_dev_nv_xpl_h_
|
||||
#define __gh100_dev_nv_xpl_h_
|
||||
#define NV_XPL_DL_ERR_COUNT_RBUF 0x00000a54 /* R--4R */
|
||||
#define NV_XPL_DL_ERR_COUNT_RBUF__PRIV_LEVEL_MASK 0x00000b08 /* */
|
||||
#define NV_XPL_DL_ERR_COUNT_RBUF_CORR_ERR 15:0 /* R-EVF */
|
||||
#define NV_XPL_DL_ERR_COUNT_RBUF_CORR_ERR_INIT 0x0000 /* R-E-V */
|
||||
#define NV_XPL_DL_ERR_COUNT_RBUF_UNCORR_ERR 31:16 /* R-EVF */
|
||||
#define NV_XPL_DL_ERR_COUNT_RBUF_UNCORR_ERR_INIT 0x0000 /* R-E-V */
|
||||
#define NV_XPL_DL_ERR_COUNT_SEQ_LUT 0x00000a58 /* R--4R */
|
||||
#define NV_XPL_DL_ERR_COUNT_SEQ_LUT__PRIV_LEVEL_MASK 0x00000b08 /* */
|
||||
#define NV_XPL_DL_ERR_COUNT_SEQ_LUT_CORR_ERR 15:0 /* R-EVF */
|
||||
#define NV_XPL_DL_ERR_COUNT_SEQ_LUT_CORR_ERR_INIT 0x0000 /* R-E-V */
|
||||
#define NV_XPL_DL_ERR_COUNT_SEQ_LUT_UNCORR_ERR 31:16 /* R-EVF */
|
||||
#define NV_XPL_DL_ERR_COUNT_SEQ_LUT_UNCORR_ERR_INIT 0x0000 /* R-E-V */
|
||||
|
||||
#define NV_XPL_DL_ERR_RESET 0x00000a5c /* RW-4R */
|
||||
#define NV_XPL_DL_ERR_RESET_RBUF_CORR_ERR_COUNT 0:0 /* RWCVF */
|
||||
#define NV_XPL_DL_ERR_RESET_RBUF_CORR_ERR_COUNT_DONE 0x0 /* RWC-V */
|
||||
#define NV_XPL_DL_ERR_RESET_RBUF_CORR_ERR_COUNT_PENDING 0x1 /* -W--T */
|
||||
#define NV_XPL_DL_ERR_RESET_SEQ_LUT_CORR_ERR_COUNT 1:1 /* RWCVF */
|
||||
#define NV_XPL_DL_ERR_RESET_SEQ_LUT_CORR_ERR_COUNT_DONE 0x0 /* RWC-V */
|
||||
#define NV_XPL_DL_ERR_RESET_SEQ_LUT_CORR_ERR_COUNT_PENDING 0x1 /* -W--T */
|
||||
#define NV_XPL_DL_ERR_RESET_RBUF_UNCORR_ERR_COUNT 16:16 /* RWCVF */
|
||||
#define NV_XPL_DL_ERR_RESET_RBUF_UNCORR_ERR_COUNT_DONE 0x0 /* RWC-V */
|
||||
#define NV_XPL_DL_ERR_RESET_RBUF_UNCORR_ERR_COUNT_PENDING 0x1 /* -W--T */
|
||||
#define NV_XPL_DL_ERR_RESET_SEQ_LUT_UNCORR_ERR_COUNT 17:17 /* RWCVF */
|
||||
#define NV_XPL_DL_ERR_RESET_SEQ_LUT_UNCORR_ERR_COUNT_DONE 0x0 /* RWC-V */
|
||||
#define NV_XPL_DL_ERR_RESET_SEQ_LUT_UNCORR_ERR_COUNT_PENDING 0x1 /* -W--T */
|
||||
#endif // __gh100_dev_nv_xpl_h_
|
||||
@@ -24,4 +24,7 @@
|
||||
#ifndef __gh100_dev_xtl_ep_pri_h__
|
||||
#define __gh100_dev_xtl_ep_pri_h__
|
||||
#define NV_EP_PCFGM 0x92FFF:0x92000 /* RW--D */
|
||||
|
||||
#define NV_XTL_EP_PRI_DED_ERROR_STATUS 0x0000043C /* RW-4R */
|
||||
#define NV_XTL_EP_PRI_RAM_ERROR_INTR_STATUS 0x000003C8 /* RW-4R */
|
||||
#endif // __gh100_dev_xtl_ep_pri_h__
|
||||
|
||||
@@ -21,3 +21,9 @@
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#define NV_CHIP_EXTENDED_SYSTEM_PHYSICAL_ADDRESS_BITS 52
|
||||
#define NV_LTC_PRI_STRIDE 8192
|
||||
#define NV_LTS_PRI_STRIDE 512
|
||||
#define NV_FBPA_PRI_STRIDE 16384
|
||||
#define NV_SCAL_LITTER_NUM_FBPAS 24
|
||||
#define NV_XPL_BASE_ADDRESS 540672
|
||||
#define NV_XTL_BASE_ADDRESS 593920
|
||||
|
||||
@@ -47,5 +47,17 @@
|
||||
#define NV_XAL_EP_INTR_0_PRI_RSP_TIMEOUT 3:3
|
||||
#define NV_XAL_EP_INTR_0_PRI_RSP_TIMEOUT_PENDING 0x1
|
||||
#define NV_XAL_EP_SCPM_PRI_DUMMY_DATA_PATTERN_INIT 0xbadf0200
|
||||
|
||||
#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT 0x0010f364 /* RW-4R */
|
||||
#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT_TOTAL 15:0 /* RWIUF */
|
||||
#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT 0x0000 /* RWI-V */
|
||||
#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT_UNIQUE 31:16 /* RWIUF */
|
||||
#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT 0x0000 /* RWI-V */
|
||||
|
||||
#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT 0x0010f37c /* RW-4R */
|
||||
#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT_TOTAL 15:0 /* RWIUF */
|
||||
#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT 0x0000 /* RWI-V */
|
||||
#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT_UNIQUE 31:16 /* RWIUF */
|
||||
#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT 0x0000 /* RWI-V */
|
||||
#endif // __gh100_pri_nv_xal_ep_h__
|
||||
|
||||
|
||||
@@ -1542,6 +1542,12 @@ nvswitch_reset_and_train_link_ls10
|
||||
nvswitch_execute_unilateral_link_shutdown_ls10(link);
|
||||
nvswitch_corelib_clear_link_state_ls10(link);
|
||||
|
||||
//
|
||||
// When a link faults there could be a race between the driver requesting
|
||||
// reset and MINION processing Emergency Shutdown. Minion will notify if
|
||||
// such a collision happens and will deny the reset request, so try the
|
||||
// request up to 3 times
|
||||
//
|
||||
do
|
||||
{
|
||||
status = nvswitch_request_tl_link_state_ls10(link,
|
||||
@@ -1597,15 +1603,18 @@ nvswitch_reset_and_train_link_ls10
|
||||
"%s: NvLink Reset has failed for link %d\n",
|
||||
__FUNCTION__, link->linkNumber);
|
||||
|
||||
// Re-register links.
|
||||
status = nvlink_lib_register_link(device->nvlink_device, link);
|
||||
if (status != NVL_SUCCESS)
|
||||
{
|
||||
nvswitch_destroy_link(link);
|
||||
return status;
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
status = nvswitch_launch_ALI_link_training(device, link, NV_FALSE);
|
||||
if (status != NVL_SUCCESS)
|
||||
{
|
||||
NVSWITCH_PRINT(device, ERROR,
|
||||
"%s: NvLink failed to request ACTIVE for link %d\n",
|
||||
__FUNCTION__, link->linkNumber);
|
||||
return status;
|
||||
}
|
||||
|
||||
return NVL_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -1345,7 +1345,6 @@ nvswitch_lib_initialize_device
|
||||
NvU8 link_num;
|
||||
nvlink_link *link = NULL;
|
||||
NvBool is_blacklisted_by_os = NV_FALSE;
|
||||
NvU64 mode;
|
||||
|
||||
if (!NVSWITCH_IS_DEVICE_ACCESSIBLE(device))
|
||||
{
|
||||
@@ -1508,18 +1507,6 @@ nvswitch_lib_initialize_device
|
||||
|
||||
nvswitch_reset_persistent_link_hw_state(device, link_num);
|
||||
|
||||
if(_nvswitch_corelib_get_dl_link_mode(link, &mode) != NVL_SUCCESS)
|
||||
{
|
||||
NVSWITCH_PRINT(device, ERROR, "%s: nvlipt_lnk_status: Failed to check link mode! LinkId %d\n",
|
||||
__FUNCTION__, link_num);
|
||||
}
|
||||
else if(mode == NVLINK_LINKSTATE_FAULT)
|
||||
{
|
||||
NVSWITCH_PRINT(device, INFO, "%s: retraining LinkId %d\n",
|
||||
__FUNCTION__, link_num);
|
||||
nvswitch_reset_and_train_link(device, link);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
retval = nvswitch_set_training_mode(device);
|
||||
@@ -1623,6 +1610,10 @@ nvswitch_lib_post_init_device
|
||||
)
|
||||
{
|
||||
NvlStatus retval;
|
||||
NvlStatus status;
|
||||
NvU32 link_num;
|
||||
NvU64 mode;
|
||||
nvlink_link *link;
|
||||
|
||||
if (!NVSWITCH_IS_DEVICE_INITIALIZED(device))
|
||||
{
|
||||
@@ -1634,7 +1625,7 @@ nvswitch_lib_post_init_device
|
||||
{
|
||||
return retval;
|
||||
}
|
||||
|
||||
|
||||
if (nvswitch_is_bios_supported(device))
|
||||
{
|
||||
retval = nvswitch_bios_get_image(device);
|
||||
@@ -1670,6 +1661,41 @@ nvswitch_lib_post_init_device
|
||||
(void)nvswitch_launch_ALI(device);
|
||||
}
|
||||
|
||||
//
|
||||
// There is an edge case where a hypervisor may not send the same number
|
||||
// of resets to the switch and GPUs, so try to re-train links in fault
|
||||
// if possible
|
||||
//
|
||||
for (link_num=0; link_num < nvswitch_get_num_links(device); link_num++)
|
||||
{
|
||||
// Sanity check
|
||||
if (!nvswitch_is_link_valid(device, link_num))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
status = nvlink_lib_get_link(device->nvlink_device, link_num, &link);
|
||||
if (status != NVL_SUCCESS)
|
||||
{
|
||||
NVSWITCH_PRINT(device, ERROR, "%s: Failed to get link for LinkId %d\n",
|
||||
__FUNCTION__, link_num);
|
||||
continue;
|
||||
}
|
||||
|
||||
// If the link is in fault then re-train
|
||||
if(_nvswitch_corelib_get_dl_link_mode(link, &mode) != NVL_SUCCESS)
|
||||
{
|
||||
NVSWITCH_PRINT(device, ERROR, "%s: nvlipt_lnk_status: Failed to check link mode! LinkId %d\n",
|
||||
__FUNCTION__, link_num);
|
||||
}
|
||||
else if(mode == NVLINK_LINKSTATE_FAULT)
|
||||
{
|
||||
NVSWITCH_PRINT(device, INFO, "%s: retraining LinkId %d\n",
|
||||
__FUNCTION__, link_num);
|
||||
nvswitch_reset_and_train_link(device, link);
|
||||
}
|
||||
}
|
||||
|
||||
return NVL_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -121,7 +121,8 @@
|
||||
#define NVLINK_FLA_PRIV_ERR (137)
|
||||
#define ROBUST_CHANNEL_DLA_ERROR (138)
|
||||
#define ROBUST_CHANNEL_FAST_PATH_ERROR (139)
|
||||
#define ROBUST_CHANNEL_LAST_ERROR (ROBUST_CHANNEL_FAST_PATH_ERROR)
|
||||
#define UNRECOVERABLE_ECC_ERROR_ESCAPE (140)
|
||||
#define ROBUST_CHANNEL_LAST_ERROR (UNRECOVERABLE_ECC_ERROR_ESCAPE)
|
||||
|
||||
|
||||
// Indexed CE reference
|
||||
|
||||
@@ -492,6 +492,17 @@ static void __nvoc_init_funcTable_OBJGPU_1(OBJGPU *pThis) {
|
||||
pThis->__gpuWriteFunctionConfigRegEx__ = &gpuWriteFunctionConfigRegEx_GM107;
|
||||
}
|
||||
|
||||
// Hal function -- gpuReadVgpuConfigReg
|
||||
if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */
|
||||
{
|
||||
pThis->__gpuReadVgpuConfigReg__ = &gpuReadVgpuConfigReg_GH100;
|
||||
}
|
||||
// default
|
||||
else
|
||||
{
|
||||
pThis->__gpuReadVgpuConfigReg__ = &gpuReadVgpuConfigReg_46f6a7;
|
||||
}
|
||||
|
||||
// Hal function -- gpuGetIdInfo
|
||||
if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */
|
||||
{
|
||||
|
||||
@@ -877,6 +877,7 @@ struct OBJGPU {
|
||||
NV_STATUS (*__gpuReadFunctionConfigReg__)(struct OBJGPU *, NvU32, NvU32, NvU32 *);
|
||||
NV_STATUS (*__gpuWriteFunctionConfigReg__)(struct OBJGPU *, NvU32, NvU32, NvU32);
|
||||
NV_STATUS (*__gpuWriteFunctionConfigRegEx__)(struct OBJGPU *, NvU32, NvU32, NvU32, THREAD_STATE_NODE *);
|
||||
NV_STATUS (*__gpuReadVgpuConfigReg__)(struct OBJGPU *, NvU32, NvU32 *);
|
||||
void (*__gpuGetIdInfo__)(struct OBJGPU *);
|
||||
void (*__gpuHandleSanityCheckRegReadError__)(struct OBJGPU *, NvU32, NvU32);
|
||||
void (*__gpuHandleSecFault__)(struct OBJGPU *);
|
||||
@@ -1427,6 +1428,8 @@ NV_STATUS __nvoc_objCreate_OBJGPU(OBJGPU**, Dynamic*, NvU32,
|
||||
#define gpuWriteFunctionConfigReg_HAL(pGpu, function, reg, data) gpuWriteFunctionConfigReg_DISPATCH(pGpu, function, reg, data)
|
||||
#define gpuWriteFunctionConfigRegEx(pGpu, function, reg, data, pThreadState) gpuWriteFunctionConfigRegEx_DISPATCH(pGpu, function, reg, data, pThreadState)
|
||||
#define gpuWriteFunctionConfigRegEx_HAL(pGpu, function, reg, data, pThreadState) gpuWriteFunctionConfigRegEx_DISPATCH(pGpu, function, reg, data, pThreadState)
|
||||
#define gpuReadVgpuConfigReg(pGpu, index, data) gpuReadVgpuConfigReg_DISPATCH(pGpu, index, data)
|
||||
#define gpuReadVgpuConfigReg_HAL(pGpu, index, data) gpuReadVgpuConfigReg_DISPATCH(pGpu, index, data)
|
||||
#define gpuGetIdInfo(pGpu) gpuGetIdInfo_DISPATCH(pGpu)
|
||||
#define gpuGetIdInfo_HAL(pGpu) gpuGetIdInfo_DISPATCH(pGpu)
|
||||
#define gpuHandleSanityCheckRegReadError(pGpu, addr, value) gpuHandleSanityCheckRegReadError_DISPATCH(pGpu, addr, value)
|
||||
@@ -2970,6 +2973,16 @@ static inline NV_STATUS gpuWriteFunctionConfigRegEx_DISPATCH(struct OBJGPU *pGpu
|
||||
return pGpu->__gpuWriteFunctionConfigRegEx__(pGpu, function, reg, data, pThreadState);
|
||||
}
|
||||
|
||||
NV_STATUS gpuReadVgpuConfigReg_GH100(struct OBJGPU *pGpu, NvU32 index, NvU32 *data);
|
||||
|
||||
// Stub implementation of gpuReadVgpuConfigReg: performs no register access,
// ignores all of its arguments (data is left unwritten) and always returns
// NV_ERR_NOT_SUPPORTED.
static inline NV_STATUS gpuReadVgpuConfigReg_46f6a7(struct OBJGPU *pGpu, NvU32 index, NvU32 *data) {
|
||||
return NV_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
// Dispatcher for gpuReadVgpuConfigReg: calls the implementation stored in
// pGpu->__gpuReadVgpuConfigReg__ with the same arguments and returns its
// status unchanged. pGpu is dereferenced without a NULL check.
static inline NV_STATUS gpuReadVgpuConfigReg_DISPATCH(struct OBJGPU *pGpu, NvU32 index, NvU32 *data) {
|
||||
return pGpu->__gpuReadVgpuConfigReg__(pGpu, index, data);
|
||||
}
|
||||
|
||||
void gpuGetIdInfo_GM107(struct OBJGPU *pGpu);
|
||||
|
||||
void gpuGetIdInfo_GH100(struct OBJGPU *pGpu);
|
||||
|
||||
@@ -425,6 +425,28 @@ static void __nvoc_init_funcTable_KernelMemorySystem_1(KernelMemorySystem *pThis
|
||||
pThis->__kmemsysRemoveAllAtsPeers__ = &kmemsysRemoveAllAtsPeers_GV100;
|
||||
}
|
||||
|
||||
// Hal function -- kmemsysCheckEccCounts
|
||||
if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */
|
||||
{
|
||||
pThis->__kmemsysCheckEccCounts__ = &kmemsysCheckEccCounts_GH100;
|
||||
}
|
||||
// default
|
||||
else
|
||||
{
|
||||
pThis->__kmemsysCheckEccCounts__ = &kmemsysCheckEccCounts_b3696a;
|
||||
}
|
||||
|
||||
// Hal function -- kmemsysClearEccCounts
|
||||
if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */
|
||||
{
|
||||
pThis->__kmemsysClearEccCounts__ = &kmemsysClearEccCounts_GH100;
|
||||
}
|
||||
// default
|
||||
else
|
||||
{
|
||||
pThis->__kmemsysClearEccCounts__ = &kmemsysClearEccCounts_56cd7a;
|
||||
}
|
||||
|
||||
pThis->__nvoc_base_OBJENGSTATE.__engstateConstructEngine__ = &__nvoc_thunk_KernelMemorySystem_engstateConstructEngine;
|
||||
|
||||
pThis->__nvoc_base_OBJENGSTATE.__engstateStateInitLocked__ = &__nvoc_thunk_KernelMemorySystem_engstateStateInitLocked;
|
||||
|
||||
@@ -222,6 +222,8 @@ struct KernelMemorySystem {
|
||||
void (*__kmemsysNumaRemoveAllMemory__)(OBJGPU *, struct KernelMemorySystem *);
|
||||
NV_STATUS (*__kmemsysSetupAllAtsPeers__)(OBJGPU *, struct KernelMemorySystem *);
|
||||
void (*__kmemsysRemoveAllAtsPeers__)(OBJGPU *, struct KernelMemorySystem *);
|
||||
void (*__kmemsysCheckEccCounts__)(OBJGPU *, struct KernelMemorySystem *);
|
||||
NV_STATUS (*__kmemsysClearEccCounts__)(OBJGPU *, struct KernelMemorySystem *);
|
||||
NV_STATUS (*__kmemsysStateLoad__)(POBJGPU, struct KernelMemorySystem *, NvU32);
|
||||
NV_STATUS (*__kmemsysStateUnload__)(POBJGPU, struct KernelMemorySystem *, NvU32);
|
||||
NV_STATUS (*__kmemsysStatePostUnload__)(POBJGPU, struct KernelMemorySystem *, NvU32);
|
||||
@@ -323,6 +325,10 @@ NV_STATUS __nvoc_objCreate_KernelMemorySystem(KernelMemorySystem**, Dynamic*, Nv
|
||||
#define kmemsysSetupAllAtsPeers_HAL(pGpu, pKernelMemorySystem) kmemsysSetupAllAtsPeers_DISPATCH(pGpu, pKernelMemorySystem)
|
||||
#define kmemsysRemoveAllAtsPeers(pGpu, pKernelMemorySystem) kmemsysRemoveAllAtsPeers_DISPATCH(pGpu, pKernelMemorySystem)
|
||||
#define kmemsysRemoveAllAtsPeers_HAL(pGpu, pKernelMemorySystem) kmemsysRemoveAllAtsPeers_DISPATCH(pGpu, pKernelMemorySystem)
|
||||
#define kmemsysCheckEccCounts(pGpu, pKernelMemorySystem) kmemsysCheckEccCounts_DISPATCH(pGpu, pKernelMemorySystem)
|
||||
#define kmemsysCheckEccCounts_HAL(pGpu, pKernelMemorySystem) kmemsysCheckEccCounts_DISPATCH(pGpu, pKernelMemorySystem)
|
||||
#define kmemsysClearEccCounts(pGpu, pKernelMemorySystem) kmemsysClearEccCounts_DISPATCH(pGpu, pKernelMemorySystem)
|
||||
#define kmemsysClearEccCounts_HAL(pGpu, pKernelMemorySystem) kmemsysClearEccCounts_DISPATCH(pGpu, pKernelMemorySystem)
|
||||
#define kmemsysStateLoad(pGpu, pEngstate, arg0) kmemsysStateLoad_DISPATCH(pGpu, pEngstate, arg0)
|
||||
#define kmemsysStateUnload(pGpu, pEngstate, arg0) kmemsysStateUnload_DISPATCH(pGpu, pEngstate, arg0)
|
||||
#define kmemsysStatePostUnload(pGpu, pEngstate, arg0) kmemsysStatePostUnload_DISPATCH(pGpu, pEngstate, arg0)
|
||||
@@ -733,6 +739,26 @@ static inline void kmemsysRemoveAllAtsPeers_DISPATCH(OBJGPU *pGpu, struct Kernel
|
||||
pKernelMemorySystem->__kmemsysRemoveAllAtsPeers__(pGpu, pKernelMemorySystem);
|
||||
}
|
||||
|
||||
void kmemsysCheckEccCounts_GH100(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem);
|
||||
|
||||
// Stub implementation of kmemsysCheckEccCounts: does nothing and returns
// immediately; both arguments are unused.
static inline void kmemsysCheckEccCounts_b3696a(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Dispatcher for kmemsysCheckEccCounts: calls the implementation stored in
// pKernelMemorySystem->__kmemsysCheckEccCounts__ with the same arguments.
// pKernelMemorySystem is dereferenced without a NULL check.
static inline void kmemsysCheckEccCounts_DISPATCH(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem) {
|
||||
pKernelMemorySystem->__kmemsysCheckEccCounts__(pGpu, pKernelMemorySystem);
|
||||
}
|
||||
|
||||
NV_STATUS kmemsysClearEccCounts_GH100(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem);
|
||||
|
||||
// Stub implementation of kmemsysClearEccCounts: performs no work and always
// returns NV_OK; both arguments are unused.
static inline NV_STATUS kmemsysClearEccCounts_56cd7a(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem) {
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// Dispatcher for kmemsysClearEccCounts: calls the implementation stored in
// pKernelMemorySystem->__kmemsysClearEccCounts__ with the same arguments and
// returns its status unchanged. pKernelMemorySystem is dereferenced without
// a NULL check.
static inline NV_STATUS kmemsysClearEccCounts_DISPATCH(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem) {
|
||||
return pKernelMemorySystem->__kmemsysClearEccCounts__(pGpu, pKernelMemorySystem);
|
||||
}
|
||||
|
||||
/*
 * Forwards the call to the implementation installed in the object's
 * __kmemsysStateLoad__ function pointer and hands back its status.
 */
static inline NV_STATUS
kmemsysStateLoad_DISPATCH(POBJGPU pGpu,
                          struct KernelMemorySystem *pEngstate,
                          NvU32 arg0)
{
    NV_STATUS status;

    status = (*pEngstate->__kmemsysStateLoad__)(pGpu, pEngstate, arg0);
    return status;
}
|
||||
|
||||
@@ -1007,6 +1007,10 @@ static const CHIPS_RELEASED sChipsReleased[] = {
|
||||
{ 0x27B0, 0x16fa, 0x103c, "NVIDIA RTX 4000 SFF Ada Generation" },
|
||||
{ 0x27B0, 0x16fa, 0x10de, "NVIDIA RTX 4000 SFF Ada Generation" },
|
||||
{ 0x27B0, 0x16fa, 0x17aa, "NVIDIA RTX 4000 SFF Ada Generation" },
|
||||
{ 0x27B1, 0x180c, 0x1028, "NVIDIA RTX 4500 Ada Generation" },
|
||||
{ 0x27B1, 0x180c, 0x103c, "NVIDIA RTX 4500 Ada Generation" },
|
||||
{ 0x27B1, 0x180c, 0x10de, "NVIDIA RTX 4500 Ada Generation" },
|
||||
{ 0x27B1, 0x180c, 0x17aa, "NVIDIA RTX 4500 Ada Generation" },
|
||||
{ 0x27B2, 0x181b, 0x1028, "NVIDIA RTX 4000 Ada Generation" },
|
||||
{ 0x27B2, 0x181b, 0x103c, "NVIDIA RTX 4000 Ada Generation" },
|
||||
{ 0x27B2, 0x181b, 0x10de, "NVIDIA RTX 4000 Ada Generation" },
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
#include "published/hopper/gh100/dev_pmc.h"
|
||||
#include "published/hopper/gh100/dev_xtl_ep_pcfg_gpu.h"
|
||||
#include "published/hopper/gh100/pri_nv_xal_ep.h"
|
||||
#include "published/hopper/gh100/dev_xtl_ep_pri.h"
|
||||
|
||||
#include "ctrl/ctrl2080/ctrl2080mc.h"
|
||||
|
||||
@@ -77,6 +78,28 @@ gpuReadBusConfigReg_GH100
|
||||
return gpuReadBusConfigCycle(pGpu, index, pData);
|
||||
}
|
||||
|
||||
/*!
|
||||
* @brief Read the non-private registers on vGPU through mirror space
|
||||
*
|
||||
* @param[in] pGpu GPU object pointer
|
||||
* @param[in] index Register offset in PCIe config space
|
||||
* @param[out] pData Value of the register
|
||||
*
|
||||
* @returns NV_OK on success
|
||||
*/
|
||||
NV_STATUS
|
||||
gpuReadVgpuConfigReg_GH100
|
||||
(
|
||||
OBJGPU *pGpu,
|
||||
NvU32 index,
|
||||
NvU32 *pData
|
||||
)
|
||||
{
|
||||
*pData = GPU_REG_RD32(pGpu, DEVICE_BASE(NV_EP_PCFGM) + index);
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
/*!
|
||||
* @brief Get GPU ID based on PCIE config reads.
|
||||
* Also determine other properties of the PCIE capabilities.
|
||||
|
||||
@@ -4941,12 +4941,19 @@ gpuReadBusConfigCycle_IMPL
|
||||
NvU8 device = gpuGetDevice(pGpu);
|
||||
NvU8 function = 0;
|
||||
|
||||
if (pGpu->hPci == NULL)
|
||||
if (IS_PASSTHRU(pGpu))
|
||||
{
|
||||
pGpu->hPci = osPciInitHandle(domain, bus, device, function, NULL, NULL);
|
||||
gpuReadVgpuConfigReg_HAL(pGpu, index, pData);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (pGpu->hPci == NULL)
|
||||
{
|
||||
pGpu->hPci = osPciInitHandle(domain, bus, device, function, NULL, NULL);
|
||||
}
|
||||
|
||||
*pData = osPciReadDword(pGpu->hPci, index);
|
||||
*pData = osPciReadDword(pGpu->hPci, index);
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include "gpu/conf_compute/conf_compute.h"
|
||||
#include "gpu/fsp/kern_fsp.h"
|
||||
#include "gpu/gsp/kernel_gsp.h"
|
||||
#include "gpu/mem_sys/kern_mem_sys.h"
|
||||
#include "gsp/gspifpub.h"
|
||||
#include "vgpu/rpc.h"
|
||||
|
||||
@@ -523,6 +524,7 @@ kgspBootstrapRiscvOSEarly_GH100
|
||||
{
|
||||
KernelFalcon *pKernelFalcon = staticCast(pKernelGsp, KernelFalcon);
|
||||
KernelFsp *pKernelFsp = GPU_GET_KERNEL_FSP(pGpu);
|
||||
KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
// Only for GSP client builds
|
||||
@@ -532,8 +534,16 @@ kgspBootstrapRiscvOSEarly_GH100
|
||||
return NV_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
// Clear ECC errors before attempting to load GSP
|
||||
status = kmemsysClearEccCounts_HAL(pGpu, pKernelMemorySystem);
|
||||
if (status != NV_OK)
|
||||
{
|
||||
NV_PRINTF(LEVEL_ERROR, "Issue clearing ECC counts! Status:0x%x\n", status);
|
||||
}
|
||||
|
||||
// Setup the descriptors that GSP-FMC needs to boot GSP-RM
|
||||
NV_ASSERT_OK_OR_RETURN(kgspSetupGspFmcArgs_HAL(pGpu, pKernelGsp, pGspFw));
|
||||
NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
|
||||
kgspSetupGspFmcArgs_HAL(pGpu, pKernelGsp, pGspFw), exit);
|
||||
|
||||
kgspSetupLibosInitArgs(pGpu, pKernelGsp);
|
||||
|
||||
@@ -562,7 +572,8 @@ kgspBootstrapRiscvOSEarly_GH100
|
||||
{
|
||||
NV_PRINTF(LEVEL_NOTICE, "Starting to boot GSP via FSP.\n");
|
||||
pKernelFsp->setProperty(pKernelFsp, PDB_PROP_KFSP_GSP_MODE_GSPRM, NV_TRUE);
|
||||
NV_ASSERT_OK_OR_RETURN(kfspSendBootCommands_HAL(pGpu, pKernelFsp));
|
||||
NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
|
||||
kfspSendBootCommands_HAL(pGpu, pKernelFsp), exit);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -585,7 +596,7 @@ kgspBootstrapRiscvOSEarly_GH100
|
||||
kfspDumpDebugState_HAL(pGpu, pKernelFsp);
|
||||
}
|
||||
|
||||
return status;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -606,7 +617,7 @@ kgspBootstrapRiscvOSEarly_GH100
|
||||
kflcnRegRead_HAL(pGpu, pKernelFalcon, NV_PFALCON_FALCON_MAILBOX0));
|
||||
NV_PRINTF(LEVEL_ERROR, "NV_PGSP_FALCON_MAILBOX1 = 0x%x\n",
|
||||
kflcnRegRead_HAL(pGpu, pKernelFalcon, NV_PFALCON_FALCON_MAILBOX1));
|
||||
return status;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
// Start polling for libos logs now that lockdown is released
|
||||
@@ -640,6 +651,11 @@ kgspBootstrapRiscvOSEarly_GH100
|
||||
NV_PRINTF(LEVEL_INFO, "GSP FW RM ready.\n");
|
||||
|
||||
exit:
|
||||
// If GSP fails to boot, check if there's any DED error.
|
||||
if (status != NV_OK)
|
||||
{
|
||||
kmemsysCheckEccCounts_HAL(pGpu, pKernelMemorySystem);
|
||||
}
|
||||
NV_ASSERT(status == NV_OK);
|
||||
|
||||
return status;
|
||||
|
||||
@@ -799,7 +799,7 @@ kgspHealthCheck_TU102
|
||||
objDelete(pReport);
|
||||
}
|
||||
|
||||
return bHealthy;
|
||||
goto exit_health_check;
|
||||
}
|
||||
|
||||
NvU32 mb0 = GPU_REG_RD32(pGpu, NV_PGSP_MAILBOX(0));
|
||||
@@ -845,6 +845,12 @@ kgspHealthCheck_TU102
|
||||
"********************************************************************************\n");
|
||||
}
|
||||
|
||||
exit_health_check:
|
||||
if (!bHealthy)
|
||||
{
|
||||
KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
|
||||
kmemsysCheckEccCounts_HAL(pGpu, pKernelMemorySystem);
|
||||
}
|
||||
return bHealthy;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
@@ -23,15 +23,24 @@
|
||||
|
||||
#include "core/core.h"
|
||||
#include "gpu/gpu.h"
|
||||
#include "nvtypes.h"
|
||||
#include "os/os.h"
|
||||
#include "kernel/gpu/mem_sys/kern_mem_sys.h"
|
||||
#include "gpu/mem_mgr/mem_desc.h"
|
||||
#include "gpu/bus/kern_bus.h"
|
||||
#include "kernel/gpu/intr/intr.h"
|
||||
#include "nverror.h"
|
||||
|
||||
#include "published/hopper/gh100/dev_fb.h"
|
||||
#include "published/hopper/gh100/dev_ltc.h"
|
||||
#include "published/hopper/gh100/dev_fbpa.h"
|
||||
#include "published/hopper/gh100/dev_vm.h"
|
||||
#include "published/hopper/gh100/pri_nv_xal_ep.h"
|
||||
#include "published/hopper/gh100/dev_nv_xal_addendum.h"
|
||||
#include "published/hopper/gh100/dev_nv_xpl.h"
|
||||
#include "published/hopper/gh100/dev_xtl_ep_pri.h"
|
||||
#include "published/hopper/gh100/hwproject.h"
|
||||
#include "published/ampere/ga100/dev_fb.h"
|
||||
|
||||
NV_STATUS
|
||||
kmemsysDoCacheOp_GH100
|
||||
@@ -566,3 +575,168 @@ kmemsysSwizzIdToVmmuSegmentsRange_GH100
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
/*!
|
||||
* Utility function used to read registers and ignore PRI errors
|
||||
*/
|
||||
static NvU32
|
||||
_kmemsysReadRegAndMaskPriError
|
||||
(
|
||||
OBJGPU *pGpu,
|
||||
NvU32 regAddr
|
||||
)
|
||||
{
|
||||
NvU32 regVal;
|
||||
|
||||
regVal = osGpuReadReg032(pGpu, regAddr);
|
||||
if ((regVal & GPU_READ_PRI_ERROR_MASK) == GPU_READ_PRI_ERROR_CODE)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
return regVal;
|
||||
}
|
||||
/*
|
||||
* @brief Function that checks if ECC error occurred by reading various count
|
||||
* registers/interrupt registers. This function is not floorsweeping-aware so
|
||||
* PRI errors are ignored
|
||||
*/
|
||||
void
|
||||
kmemsysCheckEccCounts_GH100
|
||||
(
|
||||
OBJGPU *pGpu,
|
||||
KernelMemorySystem *pKernelMemorySystem
|
||||
)
|
||||
{
|
||||
NvU32 dramCount = 0;
|
||||
NvU32 mmuCount = 0;
|
||||
NvU32 ltcCount = 0;
|
||||
NvU32 pcieCount = 0;
|
||||
NvU32 regVal;
|
||||
for (NvU32 i = 0; i < NV_SCAL_LITTER_NUM_FBPAS; i++)
|
||||
{
|
||||
for (NvU32 j = 0; j < NV_PFB_FBPA_0_ECC_DED_COUNT__SIZE_1; j++)
|
||||
{
|
||||
// DRAM count read
|
||||
dramCount += _kmemsysReadRegAndMaskPriError(pGpu, NV_PFB_FBPA_0_ECC_DED_COUNT(j) + (i * NV_FBPA_PRI_STRIDE));
|
||||
|
||||
// LTC count read
|
||||
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT +
|
||||
(i * NV_LTC_PRI_STRIDE) + (j * NV_LTS_PRI_STRIDE));
|
||||
ltcCount += DRF_VAL(_PLTCG_LTC0_LTS0, _L2_CACHE_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
|
||||
}
|
||||
}
|
||||
|
||||
// L2TLB
|
||||
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT);
|
||||
mmuCount += DRF_VAL(_PFB_PRI_MMU, _L2TLB_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
|
||||
|
||||
// HUBTLB
|
||||
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT);
|
||||
mmuCount += DRF_VAL(_PFB_PRI_MMU, _HUBTLB_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
|
||||
|
||||
// FILLUNIT
|
||||
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT);
|
||||
mmuCount += DRF_VAL(_PFB_PRI_MMU, _FILLUNIT_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
|
||||
|
||||
// PCIE RBUF
|
||||
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XPL_BASE_ADDRESS + NV_XPL_DL_ERR_COUNT_RBUF);
|
||||
pcieCount += DRF_VAL(_XPL_DL, _ERR_COUNT_RBUF, _UNCORR_ERR, regVal);
|
||||
|
||||
// PCIE SEQ_LUT
|
||||
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XPL_BASE_ADDRESS + NV_XPL_DL_ERR_COUNT_SEQ_LUT);
|
||||
pcieCount += DRF_VAL(_XPL_DL, _ERR_COUNT_SEQ_LUT, _UNCORR_ERR, regVal);
|
||||
|
||||
// PCIE RE ORDER
|
||||
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT);
|
||||
pcieCount += DRF_VAL(_XAL_EP, _REORDER_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
|
||||
|
||||
// PCIE P2PREQ
|
||||
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT);
|
||||
pcieCount += DRF_VAL(_XAL_EP, _P2PREQ_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
|
||||
|
||||
// PCIE XTL
|
||||
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XTL_BASE_ADDRESS + NV_XTL_EP_PRI_DED_ERROR_STATUS);
|
||||
if (regVal != 0)
|
||||
{
|
||||
pcieCount += 1;
|
||||
}
|
||||
|
||||
// PCIE XTL
|
||||
regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XTL_BASE_ADDRESS + NV_XTL_EP_PRI_RAM_ERROR_INTR_STATUS);
|
||||
if (regVal != 0)
|
||||
{
|
||||
pcieCount += 1;
|
||||
}
|
||||
|
||||
// If counts > 0 or if poison interrupt pending, ECC error has occurred.
|
||||
if (((dramCount + ltcCount + mmuCount + pcieCount) != 0) ||
|
||||
intrIsVectorPending_HAL(pGpu, GPU_GET_INTR(pGpu), NV_PFB_FBHUB_POISON_INTR_VECTOR_HW_INIT, NULL))
|
||||
{
|
||||
nvErrorLog_va((void *)pGpu, UNRECOVERABLE_ECC_ERROR_ESCAPE,
|
||||
"An uncorrectable ECC error detected "
|
||||
"(possible firmware handling failure) "
|
||||
"DRAM:%d, LTC:%d, MMU:%d, PCIE:%d", dramCount, ltcCount, mmuCount, pcieCount);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* @brief Function that clears ECC error count registers.
|
||||
*/
|
||||
NV_STATUS
|
||||
kmemsysClearEccCounts_GH100
|
||||
(
|
||||
OBJGPU *pGpu,
|
||||
KernelMemorySystem *pKernelMemorySystem
|
||||
)
|
||||
{
|
||||
NvU32 regVal = 0;
|
||||
RMTIMEOUT timeout;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
gpuClearFbhubPoisonIntrForBug2924523_HAL(pGpu);
|
||||
|
||||
for (NvU32 i = 0; i < NV_SCAL_LITTER_NUM_FBPAS; i++)
|
||||
{
|
||||
for (NvU32 j = 0; j < NV_PFB_FBPA_0_ECC_DED_COUNT__SIZE_1; j++)
|
||||
{
|
||||
osGpuWriteReg032(pGpu, NV_PFB_FBPA_0_ECC_DED_COUNT(j) + (i * NV_FBPA_PRI_STRIDE), 0);
|
||||
osGpuWriteReg032(pGpu, NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT + (i * NV_LTC_PRI_STRIDE) + (j * NV_LTS_PRI_STRIDE), 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Reset MMU counts
|
||||
osGpuWriteReg032(pGpu, NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT, 0);
|
||||
osGpuWriteReg032(pGpu, NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT, 0);
|
||||
osGpuWriteReg032(pGpu, NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT, 0);
|
||||
|
||||
// Reset XAL-EP counts
|
||||
osGpuWriteReg032(pGpu, NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT, 0);
|
||||
osGpuWriteReg032(pGpu, NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT, 0);
|
||||
|
||||
// Reset XTL-EP status registers
|
||||
osGpuWriteReg032(pGpu, NV_XTL_BASE_ADDRESS + NV_XTL_EP_PRI_DED_ERROR_STATUS, ~0);
|
||||
osGpuWriteReg032(pGpu, NV_XTL_BASE_ADDRESS + NV_XTL_EP_PRI_RAM_ERROR_INTR_STATUS, ~0);
|
||||
|
||||
// Reset XPL-EP error counters
|
||||
regVal = DRF_DEF(_XPL, _DL_ERR_RESET, _RBUF_UNCORR_ERR_COUNT, _PENDING) |
|
||||
DRF_DEF(_XPL, _DL_ERR_RESET, _SEQ_LUT_UNCORR_ERR_COUNT, _PENDING);
|
||||
osGpuWriteReg032(pGpu, NV_XPL_BASE_ADDRESS + NV_XPL_DL_ERR_RESET, regVal);
|
||||
|
||||
// Wait for the error counter reset to complete
|
||||
gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);
|
||||
for (;;)
|
||||
{
|
||||
status = gpuCheckTimeout(pGpu, &timeout);
|
||||
|
||||
regVal = osGpuReadReg032(pGpu, NV_XPL_BASE_ADDRESS + NV_XPL_DL_ERR_RESET);
|
||||
|
||||
if (FLD_TEST_DRF(_XPL, _DL_ERR_RESET, _RBUF_UNCORR_ERR_COUNT, _DONE, regVal) &&
|
||||
FLD_TEST_DRF(_XPL, _DL_ERR_RESET, _SEQ_LUT_UNCORR_ERR_COUNT, _DONE, regVal))
|
||||
break;
|
||||
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
@@ -1013,7 +1013,7 @@ _rmapiControlWithSecInfoTlsIRQL
|
||||
NV_STATUS status;
|
||||
THREAD_STATE_NODE threadState;
|
||||
|
||||
NvU8 stackAllocator[TLS_ISR_ALLOCATOR_SIZE];
|
||||
NvU8 stackAllocator[2*TLS_ISR_ALLOCATOR_SIZE];
|
||||
PORT_MEM_ALLOCATOR* pIsrAllocator = portMemAllocatorCreateOnExistingBlock(stackAllocator, sizeof(stackAllocator));
|
||||
tlsIsrInit(pIsrAllocator);
|
||||
|
||||
|
||||
@@ -1444,6 +1444,14 @@ _portMemAllocatorCreateOnExistingBlock
|
||||
pAllocator->pTracking = NULL; // No tracking for this allocator
|
||||
pAllocator->pImpl = (PORT_MEM_ALLOCATOR_IMPL*)(pAllocator + 1);
|
||||
|
||||
|
||||
//
|
||||
// PORT_MEM_BITVECTOR (pAllocator->pImpl) and PORT_MEM_ALLOCATOR_TRACKING (pAllocator->pImpl->tracking)
|
||||
// are mutually exclusively used.
|
||||
// When pAllocator->pTracking = NULL the data in pAllocator->pImpl->tracking is not used and instead
|
||||
// pBitVector uses the same memory location.
|
||||
// When pAllocator->pImpl->tracking is in use, PORT_MEM_BITVECTOR is not used.
|
||||
//
|
||||
pBitVector = (PORT_MEM_BITVECTOR*)(pAllocator->pImpl);
|
||||
pBitVector->pSpinlock = pSpinlock;
|
||||
|
||||
@@ -1544,6 +1552,10 @@ _portMemAllocatorAllocExistingWrapper
|
||||
{
|
||||
portSyncSpinlockRelease(pSpinlock);
|
||||
}
|
||||
if (pMem == NULL)
|
||||
{
|
||||
PORT_MEM_PRINT_ERROR("Memory allocation failed.\n");
|
||||
}
|
||||
return pMem;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
NVIDIA_VERSION = 535.104.05
|
||||
NVIDIA_VERSION = 535.104.12
|
||||
|
||||
# This file.
|
||||
VERSION_MK_FILE := $(lastword $(MAKEFILE_LIST))
|
||||
|
||||
Reference in New Issue
Block a user