515.76

2026-04-30 19:21:15 +00:00 · 2022-09-20 13:54:59 -07:00
parent 91e02299dc
commit 70259dbe7a
47 changed files with 805 additions and 350 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,13 @@

 ## Release 515 Entries

+### [515.76] 2022-09-20
+
+#### Fixed
+
+- Improved compatibility with new Linux kernel releases
+- Fixed possible excessive GPU power draw on an idle X11 or Wayland desktop when driving high resolutions or refresh rates
+
 ### [515.65.01] 2022-08-02

 #### Fixed
--- a/README.md
+++ b/README.md
@@ -643,6 +643,8 @@ Subsystem Device ID.
 | NVIDIA A100-PG509-200                           | 20B0 10DE 1450 |
 | NVIDIA A100-SXM4-80GB                           | 20B2 10DE 1463 |
 | NVIDIA A100-SXM4-80GB                           | 20B2 10DE 147F |
+| NVIDIA A100-SXM4-80GB                           | 20B2 10DE 1622 |
+| NVIDIA A100-SXM4-80GB                           | 20B2 10DE 1623 |
 | NVIDIA PG506-242                                | 20B3 10DE 14A7 |
 | NVIDIA PG506-243                                | 20B3 10DE 14A8 |
 | NVIDIA A100 80GB PCIe                           | 20B5 10DE 1533 |
@@ -743,6 +745,7 @@ Subsystem Device ID.
 | NVIDIA GeForce RTX 3050                         | 2507           |
 | NVIDIA GeForce RTX 3050 OEM                     | 2508           |
 | NVIDIA GeForce RTX 3060 Laptop GPU              | 2520           |
+| NVIDIA GeForce RTX 3060 Laptop GPU              | 2521           |
 | NVIDIA GeForce RTX 3050 Ti Laptop GPU           | 2523           |
 | NVIDIA RTX A2000                                | 2531 1028 151D |
 | NVIDIA RTX A2000                                | 2531 103C 151D |
--- a/kernel-open/Kbuild
+++ b/kernel-open/Kbuild
@@ -203,9 +203,108 @@ $(obj)/conftest/patches.h: $(NV_CONFTEST_SCRIPT)
 	@mkdir -p $(obj)/conftest
 	@$(NV_CONFTEST_CMD) patch_check > $@

-$(obj)/conftest/headers.h: $(NV_CONFTEST_SCRIPT)
-	@mkdir -p $(obj)/conftest
-	@$(NV_CONFTEST_CMD) test_kernel_headers '$(NV_CONFTEST_CFLAGS)' > $@
+
+# Each of these headers is checked for presence with a test #include; a
+# corresponding #define will be generated in conftest/headers.h.
+NV_HEADER_PRESENCE_TESTS = \
+ asm/system.h \
+ drm/drmP.h \
+ drm/drm_auth.h \
+ drm/drm_gem.h \
+ drm/drm_crtc.h \
+ drm/drm_atomic.h \
+ drm/drm_atomic_helper.h \
+ drm/drm_encoder.h \
+ drm/drm_atomic_uapi.h \
+ drm/drm_drv.h \
+ drm/drm_framebuffer.h \
+ drm/drm_connector.h \
+ drm/drm_probe_helper.h \
+ drm/drm_blend.h \
+ drm/drm_fourcc.h \
+ drm/drm_prime.h \
+ drm/drm_plane.h \
+ drm/drm_vblank.h \
+ drm/drm_file.h \
+ drm/drm_ioctl.h \
+ drm/drm_device.h \
+ drm/drm_mode_config.h \
+ dt-bindings/interconnect/tegra_icc_id.h \
+ generated/autoconf.h \
+ generated/compile.h \
+ generated/utsrelease.h \
+ linux/efi.h \
+ linux/kconfig.h \
+ linux/platform/tegra/mc_utils.h \
+ linux/semaphore.h \
+ linux/printk.h \
+ linux/ratelimit.h \
+ linux/prio_tree.h \
+ linux/log2.h \
+ linux/of.h \
+ linux/bug.h \
+ linux/sched/signal.h \
+ linux/sched/task.h \
+ linux/sched/task_stack.h \
+ xen/ioemu.h \
+ linux/fence.h \
+ linux/dma-resv.h \
+ soc/tegra/chip-id.h \
+ soc/tegra/fuse.h \
+ soc/tegra/tegra_bpmp.h \
+ video/nv_internal.h \
+ linux/platform/tegra/dce/dce-client-ipc.h \
+ linux/nvhost.h \
+ linux/nvhost_t194.h \
+ asm/book3s/64/hash-64k.h \
+ asm/set_memory.h \
+ asm/prom.h \
+ asm/powernv.h \
+ linux/atomic.h \
+ asm/barrier.h \
+ asm/opal-api.h \
+ sound/hdaudio.h \
+ asm/pgtable_types.h \
+ linux/stringhash.h \
+ linux/dma-map-ops.h \
+ rdma/peer_mem.h \
+ sound/hda_codec.h \
+ linux/dma-buf.h \
+ linux/time.h \
+ linux/platform_device.h \
+ linux/mutex.h \
+ linux/reset.h \
+ linux/of_platform.h \
+ linux/of_device.h \
+ linux/of_gpio.h \
+ linux/gpio.h \
+ linux/gpio/consumer.h \
+ linux/interconnect.h \
+ linux/pm_runtime.h \
+ linux/clk.h \
+ linux/clk-provider.h \
+ linux/ioasid.h \
+ linux/stdarg.h \
+ linux/iosys-map.h \
+ asm/coco.h
+
+# Filename to store the define for the header in $(1); this is only consumed by
+# the rule below that concatenates all of these together.
+NV_HEADER_PRESENCE_PART = $(addprefix $(obj)/conftest/header_presence/,$(addsuffix .part,$(1)))
+
+# Define a rule to check the header $(1).
+define NV_HEADER_PRESENCE_CHECK
+ $$(call NV_HEADER_PRESENCE_PART,$(1)): $$(NV_CONFTEST_SCRIPT) $(obj)/conftest/uts_release
+	@mkdir -p $$(dir $$@)
+	@$$(NV_CONFTEST_CMD) test_kernel_header '$$(NV_CONFTEST_CFLAGS)' '$(1)' > $$@
+endef
+
+# Evaluate the rule above for each header in the list.
+$(foreach header,$(NV_HEADER_PRESENCE_TESTS),$(eval $(call NV_HEADER_PRESENCE_CHECK,$(header))))
+
+# Concatenate all of the parts into headers.h.
+$(obj)/conftest/headers.h: $(call NV_HEADER_PRESENCE_PART,$(NV_HEADER_PRESENCE_TESTS))
+	@cat $^ > $@

 clean-dirs := $(obj)/conftest

--- a/kernel-open/common/inc/nv-linux.h
+++ b/kernel-open/common/inc/nv-linux.h
@@ -227,6 +227,7 @@ static inline uid_t __kuid_val(uid_t uid)
 #endif

 #include <linux/fb.h>               /* fb_info struct                   */
+#include <linux/screen_info.h>      /* screen_info                      */

 #if !defined(CONFIG_PCI)
 #warning "Attempting to build driver for a platform with no PCI support!"
--- a/kernel-open/common/inc/nv-pgprot.h
+++ b/kernel-open/common/inc/nv-pgprot.h
@@ -78,13 +78,8 @@ static inline pgprot_t pgprot_modify_writecombine(pgprot_t old_prot)

 #define NV_PGPROT_UNCACHED_DEVICE(old_prot)     pgprot_noncached(old_prot)
 #if defined(NVCPU_AARCH64)
-#if defined(NV_MT_DEVICE_GRE_PRESENT)
-#define NV_PROT_WRITE_COMBINED_DEVICE   (PROT_DEFAULT | PTE_PXN | PTE_UXN |   \
-                                         PTE_ATTRINDX(MT_DEVICE_GRE))
-#else
 #define NV_PROT_WRITE_COMBINED_DEVICE   (PROT_DEFAULT | PTE_PXN | PTE_UXN |   \
                                         PTE_ATTRINDX(MT_DEVICE_nGnRE))
-#endif
 #define NV_PGPROT_WRITE_COMBINED_DEVICE(old_prot)                             \
    __pgprot_modify(old_prot, PTE_ATTRINDX_MASK, NV_PROT_WRITE_COMBINED_DEVICE)
 #define NV_PGPROT_WRITE_COMBINED(old_prot)      NV_PGPROT_UNCACHED(old_prot)
--- a/kernel-open/common/inc/nv.h
+++ b/kernel-open/common/inc/nv.h
@@ -624,27 +624,45 @@ typedef enum
 #define NV_GET_NV_STATE(pGpu) \
    (nv_state_t *)((pGpu) ? (pGpu)->pOsGpuInfo : NULL)

-#define IS_REG_OFFSET(nv, offset, length)                                       \
-    (((offset) >= (nv)->regs->cpu_address) &&                                   \
-    (((offset) + ((length)-1)) <=                                               \
-        (nv)->regs->cpu_address + ((nv)->regs->size-1)))
+static inline NvBool IS_REG_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    return ((offset >= nv->regs->cpu_address) &&

-#define IS_FB_OFFSET(nv, offset, length)                                        \
-    (((nv)->fb) && ((offset) >= (nv)->fb->cpu_address) &&                       \
-    (((offset) + ((length)-1)) <= (nv)->fb->cpu_address + ((nv)->fb->size-1)))

-#define IS_UD_OFFSET(nv, offset, length)                                        \
-    (((nv)->ud.cpu_address != 0) && ((nv)->ud.size != 0) &&                     \
-    ((offset) >= (nv)->ud.cpu_address) &&                                       \
-    (((offset) + ((length)-1)) <= (nv)->ud.cpu_address + ((nv)->ud.size-1)))

-#define IS_IMEM_OFFSET(nv, offset, length)                                      \
-    (((nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address != 0) &&                    \
-     ((nv)->bars[NV_GPU_BAR_INDEX_IMEM].size != 0) &&                           \
-     ((offset) >= (nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address) &&             \
-     (((offset) + ((length) - 1)) <=                                            \
-        (nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address +                         \
-            ((nv)->bars[NV_GPU_BAR_INDEX_IMEM].size - 1)))
+            ((offset + (length - 1)) <= (nv->regs->cpu_address + (nv->regs->size - 1))));
+}
+
+static inline NvBool IS_FB_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    return  ((nv->fb) && (offset >= nv->fb->cpu_address) &&
+
+
+
+             ((offset + (length - 1)) <= (nv->fb->cpu_address + (nv->fb->size - 1))));
+}
+
+static inline NvBool IS_UD_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    return ((nv->ud.cpu_address != 0) && (nv->ud.size != 0) &&
+            (offset >= nv->ud.cpu_address) &&
+
+
+
+            ((offset + (length - 1)) <= (nv->ud.cpu_address + (nv->ud.size - 1))));
+}
+
+static inline NvBool IS_IMEM_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    return ((nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address != 0) &&
+            (nv->bars[NV_GPU_BAR_INDEX_IMEM].size != 0) &&
+            (offset >= nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address) &&
+
+
+
+            ((offset + (length - 1)) <= (nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address +
+                                         (nv->bars[NV_GPU_BAR_INDEX_IMEM].size - 1))));
+}

 #define NV_RM_MAX_MSIX_LINES  8

--- a/kernel-open/conftest.sh
+++ b/kernel-open/conftest.sh
@@ -55,9 +55,13 @@ append_conftest() {
    done
 }

-translate_and_preprocess_header_files() {
-    # Inputs:
-    #   $1: list of relative file paths
+test_header_presence() {
+    #
+    # Determine if the given header file (which may or may not be
+    # present) is provided by the target kernel.
+    #
+    # Input:
+    #   $1: relative file path
    #
    # This routine creates an upper case, underscore version of each of the
    # relative file paths, and uses that as the token to either define or
@@ -73,7 +77,7 @@ translate_and_preprocess_header_files() {
    # strings, without special handling of the beginning or the end of the line.
    TEST_CFLAGS=`echo "-E -M $CFLAGS " | sed -e 's/\( -M[DG]\)* / /g'`

-    for file in "$@"; do
+    file="$1"
    file_define=NV_`echo $file | tr '/.' '_' | tr '-' '_' | tr 'a-z' 'A-Z'`_PRESENT

    CODE="#include <$file>"
@@ -92,96 +96,6 @@ translate_and_preprocess_header_files() {
            echo "#define $file_define"
        fi
    fi
-    done
-}
-
-test_headers() {
-    #
-    # Determine which header files (of a set that may or may not be
-    # present) are provided by the target kernel.
-    #
-    FILES="asm/system.h"
-    FILES="$FILES drm/drmP.h"
-    FILES="$FILES drm/drm_auth.h"
-    FILES="$FILES drm/drm_gem.h"
-    FILES="$FILES drm/drm_crtc.h"
-    FILES="$FILES drm/drm_atomic.h"
-    FILES="$FILES drm/drm_atomic_helper.h"
-    FILES="$FILES drm/drm_encoder.h"
-    FILES="$FILES drm/drm_atomic_uapi.h"
-    FILES="$FILES drm/drm_drv.h"
-    FILES="$FILES drm/drm_framebuffer.h"
-    FILES="$FILES drm/drm_connector.h"
-    FILES="$FILES drm/drm_probe_helper.h"
-    FILES="$FILES drm/drm_blend.h"
-    FILES="$FILES drm/drm_fourcc.h"
-    FILES="$FILES drm/drm_prime.h"
-    FILES="$FILES drm/drm_plane.h"
-    FILES="$FILES drm/drm_vblank.h"
-    FILES="$FILES drm/drm_file.h"
-    FILES="$FILES drm/drm_ioctl.h"
-    FILES="$FILES drm/drm_device.h"
-    FILES="$FILES drm/drm_mode_config.h"
-    FILES="$FILES dt-bindings/interconnect/tegra_icc_id.h"
-    FILES="$FILES generated/autoconf.h"
-    FILES="$FILES generated/compile.h"
-    FILES="$FILES generated/utsrelease.h"
-    FILES="$FILES linux/efi.h"
-    FILES="$FILES linux/kconfig.h"
-    FILES="$FILES linux/platform/tegra/mc_utils.h"
-    FILES="$FILES linux/semaphore.h"
-    FILES="$FILES linux/printk.h"
-    FILES="$FILES linux/ratelimit.h"
-    FILES="$FILES linux/prio_tree.h"
-    FILES="$FILES linux/log2.h"
-    FILES="$FILES linux/of.h"
-    FILES="$FILES linux/bug.h"
-    FILES="$FILES linux/sched/signal.h"
-    FILES="$FILES linux/sched/task.h"
-    FILES="$FILES linux/sched/task_stack.h"
-    FILES="$FILES xen/ioemu.h"
-    FILES="$FILES linux/fence.h"
-    FILES="$FILES linux/dma-resv.h"
-    FILES="$FILES soc/tegra/chip-id.h"
-    FILES="$FILES soc/tegra/fuse.h"
-    FILES="$FILES soc/tegra/tegra_bpmp.h"
-    FILES="$FILES video/nv_internal.h"
-    FILES="$FILES linux/platform/tegra/dce/dce-client-ipc.h"
-    FILES="$FILES linux/nvhost.h"
-    FILES="$FILES linux/nvhost_t194.h"
-    FILES="$FILES asm/book3s/64/hash-64k.h"
-    FILES="$FILES asm/set_memory.h"
-    FILES="$FILES asm/prom.h"
-    FILES="$FILES asm/powernv.h"
-    FILES="$FILES linux/atomic.h"
-    FILES="$FILES asm/barrier.h"
-    FILES="$FILES asm/opal-api.h"
-    FILES="$FILES sound/hdaudio.h"
-    FILES="$FILES asm/pgtable_types.h"
-    FILES="$FILES linux/stringhash.h"
-    FILES="$FILES linux/dma-map-ops.h"
-    FILES="$FILES rdma/peer_mem.h"
-    FILES="$FILES sound/hda_codec.h"
-    FILES="$FILES linux/dma-buf.h"
-    FILES="$FILES linux/time.h"
-    FILES="$FILES linux/platform_device.h"
-    FILES="$FILES linux/mutex.h"
-    FILES="$FILES linux/reset.h"
-    FILES="$FILES linux/of_platform.h"
-    FILES="$FILES linux/of_device.h"
-    FILES="$FILES linux/of_gpio.h"
-    FILES="$FILES linux/gpio.h"
-    FILES="$FILES linux/gpio/consumer.h"
-    FILES="$FILES linux/interconnect.h"
-    FILES="$FILES linux/pm_runtime.h"
-    FILES="$FILES linux/clk.h"
-    FILES="$FILES linux/clk-provider.h"
-    FILES="$FILES linux/ioasid.h"
-    FILES="$FILES linux/stdarg.h"
-    FILES="$FILES linux/iosys-map.h"
-    FILES="$FILES asm/coco.h"
-
-    translate_and_preprocess_header_files $FILES
 }

 build_cflags() {
@@ -2420,23 +2334,6 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_PCI_DEV_HAS_ATS_ENABLED" "" "types"
        ;;

-        mt_device_gre)
-            #
-            # Determine if MT_DEVICE_GRE flag is present.
-            #
-            # MT_DEVICE_GRE flag is removed by commit 58cc6b72a21274
-            # ("arm64: mm: Remove unused support for Device-GRE memory type") in v5.14-rc1
-            # (2021-06-01).
-            #
-            CODE="
-            #include <asm/memory.h>
-            unsigned int conftest_mt_device_gre(void) {
-                return MT_DEVICE_GRE;
-            }"
-
-            compile_check_conftest "$CODE" "NV_MT_DEVICE_GRE_PRESENT" "" "types"
-        ;;
-
        get_user_pages)
            #
            # Conftest for get_user_pages()
@@ -5366,6 +5263,23 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_GET_TASK_IOPRIO_PRESENT" "" "functions"
        ;;

+        num_registered_fb)
+            #
+            # Determine if 'num_registered_fb' variable is present.
+            #
+            # 'num_registered_fb' was removed by commit 5727dcfd8486
+            # ("fbdev: Make registered_fb[] private to fbmem.c") for
+            # v5.20 linux-next (2022-07-27).
+            #
+            CODE="
+            #include <linux/fb.h>
+            int conftest_num_registered_fb(void) {
+                return num_registered_fb;
+            }"
+
+            compile_check_conftest "$CODE" "NV_NUM_REGISTERED_FB_PRESENT" "" "types"
+        ;;
+
        # When adding a new conftest entry, please use the correct format for
        # specifying the relevant upstream Linux kernel commit.
        #
@@ -5764,14 +5678,14 @@ case "$5" in
    ;;


-    test_kernel_headers)
+    test_kernel_header)
        #
-        # Check for the availability of certain kernel headers
+        # Check for the availability of the given kernel header
        #

        CFLAGS=$6

-        test_headers
+        test_header_presence "${7}"

        for file in conftest*.d; do
            rm -f $file > /dev/null 2>&1
--- a/kernel-open/nvidia-drm/nvidia-drm-helper.c
+++ b/kernel-open/nvidia-drm/nvidia-drm-helper.c
@@ -41,6 +41,19 @@
 #include <drm/drm_atomic_uapi.h>
 #endif

+/*
+ * The inclusion of drm_framebuffer.h was removed from drm_crtc.h by commit
+ * 720cf96d8fecde29b72e1101f8a567a0ce99594f ("drm: Drop drm_framebuffer.h from
+ * drm_crtc.h") in linux-next, expected in v5.19-rc7.
+ *
+ * We only need drm_framebuffer.h for drm_framebuffer_put(), and it is always
+ * present (v4.9+) when drm_framebuffer_{put,get}() is present (v4.12+), so it
+ * is safe to unconditionally include it when drm_framebuffer_get() is present.
+ */
+#if defined(NV_DRM_FRAMEBUFFER_GET_PRESENT)
+#include <drm/drm_framebuffer.h>
+#endif
+
 static void __nv_drm_framebuffer_put(struct drm_framebuffer *fb)
 {
 #if defined(NV_DRM_FRAMEBUFFER_GET_PRESENT)
--- a/kernel-open/nvidia-modeset/nvidia-modeset-linux.c
+++ b/kernel-open/nvidia-modeset/nvidia-modeset-linux.c
@@ -59,6 +59,9 @@

 #define NVKMS_LOG_PREFIX "nvidia-modeset: "

+static bool output_rounding_fix = false;
+module_param_named(output_rounding_fix, output_rounding_fix, bool, 0400);
+
 /* These parameters are used for fault injection tests.  Normally the defaults
 * should be used. */
 MODULE_PARM_DESC(fail_malloc, "Fail the Nth call to nvkms_alloc");
@@ -71,6 +74,10 @@ module_param_named(malloc_verbose, malloc_verbose, bool, 0400);

 static atomic_t nvkms_alloc_called_count;

+NvBool nvkms_output_rounding_fix(void)
+{
+    return output_rounding_fix;
+}

 #define NVKMS_SYNCPT_STUBS_NEEDED

--- a/kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h
+++ b/kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h
@@ -110,6 +110,7 @@ typedef struct {
    } set_maxval;
 } NvKmsSyncPtOpParams;

+NvBool nvkms_output_rounding_fix(void);

 void   nvkms_call_rm    (void *ops);
 void*  nvkms_alloc      (size_t size,
--- a/kernel-open/nvidia-uvm/uvm_channel.c
+++ b/kernel-open/nvidia-uvm/uvm_channel.c
@@ -35,10 +35,6 @@
 #include "nv_uvm_interface.h"
 #include "clb06f.h"

-#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_DEFAULT 1024
-#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MIN 32
-#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MAX (1024 * 1024)
-
 static unsigned uvm_channel_num_gpfifo_entries = UVM_CHANNEL_NUM_GPFIFO_ENTRIES_DEFAULT;

 #define UVM_CHANNEL_GPFIFO_LOC_DEFAULT "auto"
@@ -86,6 +82,12 @@ static NvU32 uvm_channel_update_progress_with_max(uvm_channel_t *channel,

    uvm_spin_lock(&channel->pool->lock);

+    // Completed value should never exceed the queued value
+    UVM_ASSERT_MSG_RELEASE(completed_value <= channel->tracking_sem.queued_value,
+                           "GPU %s channel %s unexpected completed_value 0x%llx > queued_value 0x%llx\n",
+                           channel->pool->manager->gpu->parent->name, channel->name, completed_value,
+                           channel->tracking_sem.queued_value);
+
    cpu_put = channel->cpu_put;
    gpu_get = channel->gpu_get;

@@ -395,6 +397,14 @@ static void uvm_channel_semaphore_release(uvm_push_t *push, NvU64 semaphore_va,
 {
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

+    // We used to skip the membar or use membar GPU for the semaphore release
+    // for a few pushes, but that doesn't provide sufficient ordering guarantees
+    // in some cases (e.g. ga100 with an LCE with PCEs from both HSHUBs) for the
+    // semaphore writes. To be safe, just always use a membar sys for now.
+    // TODO bug 3770539: Optimize membars used by end of push semaphore releases
+    (void)uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
+    (void)uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
+
    if (uvm_channel_is_ce(push->channel))
        gpu->parent->ce_hal->semaphore_release(push, semaphore_va, new_payload);

@@ -1562,6 +1572,7 @@ static void uvm_channel_print_info(uvm_channel_t *channel, struct seq_file *s)
    UVM_SEQ_OR_DBG_PRINT(s, "get                %u\n", channel->gpu_get);
    UVM_SEQ_OR_DBG_PRINT(s, "put                %u\n", channel->cpu_put);
    UVM_SEQ_OR_DBG_PRINT(s, "Semaphore GPU VA   0x%llx\n", uvm_channel_tracking_semaphore_get_gpu_va(channel));
+    UVM_SEQ_OR_DBG_PRINT(s, "Semaphore CPU VA   0x%llx\n", (NvU64)(uintptr_t)channel->tracking_sem.semaphore.payload);

    uvm_spin_unlock(&channel->pool->lock);
 }
--- a/kernel-open/nvidia-uvm/uvm_channel.h
+++ b/kernel-open/nvidia-uvm/uvm_channel.h
@@ -46,6 +46,21 @@
 // wait for a GPFIFO entry to free up.
 //

+#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_DEFAULT 1024
+#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MIN 32
+#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MAX (1024 * 1024)
+
+// Semaphore payloads cannot advance too much between calls to
+// uvm_gpu_tracking_semaphore_update_completed_value(). In practice the jumps
+// are bounded by gpfifo sizing as we have to update the completed value to
+// reclaim gpfifo entries. Set a limit based on the max gpfifo entries we could
+// ever see.
+//
+// Logically this define belongs to uvm_gpu_semaphore.h but it depends on the
+// channel GPFIFO sizing defined here so it's easiest to just have it here as
+// uvm_channel.h includes uvm_gpu_semaphore.h.
+#define UVM_GPU_SEMAPHORE_MAX_JUMP (2 * UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MAX)
+
 // Channel types
 typedef enum
 {
--- a/kernel-open/nvidia-uvm/uvm_channel_test.c
+++ b/kernel-open/nvidia-uvm/uvm_channel_test.c
@@ -151,6 +151,37 @@ done:
    return status;
 }

+static NV_STATUS test_unexpected_completed_values(uvm_va_space_t *va_space)
+{
+    NV_STATUS status;
+    uvm_gpu_t *gpu;
+
+    for_each_va_space_gpu(gpu, va_space) {
+        uvm_channel_t *channel;
+        NvU64 completed_value;
+
+        // The GPU channel manager is destroyed and then re-created after
+        // the test, so this test requires exclusive access to the GPU.
+        TEST_CHECK_RET(uvm_gpu_retained_count(gpu) == 1);
+
+        channel = &gpu->channel_manager->channel_pools[0].channels[0];
+        completed_value = uvm_channel_update_completed_value(channel);
+        uvm_gpu_semaphore_set_payload(&channel->tracking_sem.semaphore, (NvU32)completed_value + 1);
+
+        TEST_CHECK_RET(uvm_global_get_status() == NV_OK);
+        uvm_channel_update_progress_all(channel);
+        TEST_CHECK_RET(uvm_global_reset_fatal_error() == NV_ERR_INVALID_STATE);
+
+        uvm_channel_manager_destroy(gpu->channel_manager);
+        // Destruction will hit the error again, so clear one more time.
+        uvm_global_reset_fatal_error();
+
+        TEST_NV_CHECK_RET(uvm_channel_manager_create(gpu, &gpu->channel_manager));
+    }
+
+    return NV_OK;
+}
+
 static NV_STATUS uvm_test_rc_for_gpu(uvm_gpu_t *gpu)
 {
    uvm_push_t push;
@@ -712,6 +743,14 @@ NV_STATUS uvm_test_channel_sanity(UVM_TEST_CHANNEL_SANITY_PARAMS *params, struct



+    g_uvm_global.disable_fatal_error_assert = true;
+    uvm_release_asserts_set_global_error_for_tests = true;
+    status = test_unexpected_completed_values(va_space);
+    uvm_release_asserts_set_global_error_for_tests = false;
+    g_uvm_global.disable_fatal_error_assert = false;
+    if (status != NV_OK)
+        goto done;
+
    if (g_uvm_global.num_simulated_devices == 0) {
        status = test_rc(va_space);
        if (status != NV_OK)
--- a/kernel-open/nvidia-uvm/uvm_common.c
+++ b/kernel-open/nvidia-uvm/uvm_common.c
@@ -48,6 +48,33 @@ module_param(uvm_enable_builtin_tests, int, S_IRUGO);
 MODULE_PARM_DESC(uvm_enable_builtin_tests,
                 "Enable the UVM built-in tests. (This is a security risk)");

+// Default to release asserts being enabled.
+int uvm_release_asserts __read_mostly = 1;
+
+// Make the module param writable so that release asserts can be enabled or
+// disabled at any time by modifying the module parameter.
+module_param(uvm_release_asserts, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(uvm_release_asserts, "Enable uvm asserts included in release builds.");
+
+// Default to failed release asserts not dumping stack.
+int uvm_release_asserts_dump_stack __read_mostly = 0;
+
+// Make the module param writable so that dumping the stack can be enabled and
+// disabled at any time by modifying the module parameter.
+module_param(uvm_release_asserts_dump_stack, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(uvm_release_asserts_dump_stack, "dump_stack() on failed UVM release asserts.");
+
+// Default to failed release asserts not setting the global UVM error.
+int uvm_release_asserts_set_global_error __read_mostly = 0;
+
+// Make the module param writable so that setting the global fatal error can be
+// enabled and disabled at any time by modifying the module parameter.
+module_param(uvm_release_asserts_set_global_error, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(uvm_release_asserts_set_global_error, "Set UVM global fatal error on failed release asserts.");
+
+// A separate flag to enable setting global error, to be used by tests only.
+bool uvm_release_asserts_set_global_error_for_tests __read_mostly = false;
+
 //
 // Convert kernel errno codes to corresponding NV_STATUS
 //
--- a/kernel-open/nvidia-uvm/uvm_common.h
+++ b/kernel-open/nvidia-uvm/uvm_common.h
@@ -80,6 +80,9 @@ bool uvm_debug_prints_enabled(void);
 #define UVM_ASSERT_PRINT(fmt, ...) \
    UVM_PRINT_FUNC_PREFIX(printk, KERN_ERR NVIDIA_UVM_PRETTY_PRINTING_PREFIX, " " fmt, ##__VA_ARGS__)

+#define UVM_ASSERT_PRINT_RL(fmt, ...) \
+    UVM_PRINT_FUNC_PREFIX(printk_ratelimited, KERN_ERR NVIDIA_UVM_PRETTY_PRINTING_PREFIX, " " fmt, ##__VA_ARGS__)
+
 #define UVM_ERR_PRINT(fmt, ...) \
    UVM_PRINT_FUNC_PREFIX_CHECK(printk, KERN_ERR NVIDIA_UVM_PRETTY_PRINTING_PREFIX, " " fmt, ##__VA_ARGS__)

@@ -146,9 +149,7 @@ void on_uvm_test_fail(void);
 // Unlike on_uvm_test_fail it provides 'panic' coverity semantics
 void on_uvm_assert(void);

-// UVM_ASSERT_RELEASE and UVM_ASSERT_MSG_RELEASE are always enabled, even on
-// release builds.
-#define _UVM_ASSERT_MSG_RELEASE(expr, cond, fmt, ...)                                           \
+#define _UVM_ASSERT_MSG(expr, cond, fmt, ...)                                                   \
    do {                                                                                        \
        if (unlikely(!(expr))) {                                                                \
            UVM_ASSERT_PRINT("Assert failed, condition %s not true" fmt, cond, ##__VA_ARGS__);  \
@@ -157,9 +158,6 @@ void on_uvm_assert(void);
        }                                                                                       \
    } while (0)

-#define UVM_ASSERT_MSG_RELEASE(expr, fmt, ...)  _UVM_ASSERT_MSG_RELEASE(expr, #expr, ": " fmt, ##__VA_ARGS__)
-#define UVM_ASSERT_RELEASE(expr)                _UVM_ASSERT_MSG_RELEASE(expr, #expr, "\n")
-
 // Prevent function calls in expr and the print argument list from being
 // evaluated.
 #define UVM_ASSERT_MSG_IGNORE(expr, fmt, ...)   \
@@ -170,13 +168,42 @@ void on_uvm_assert(void);

 // UVM_ASSERT and UVM_ASSERT_MSG are only enabled on non-release and Coverity builds
 #if UVM_IS_DEBUG() || defined __COVERITY__
-    #define UVM_ASSERT_MSG                  UVM_ASSERT_MSG_RELEASE
-    #define UVM_ASSERT                      UVM_ASSERT_RELEASE
+    #define UVM_ASSERT_MSG(expr, fmt, ...)  _UVM_ASSERT_MSG(expr, #expr, ": " fmt, ##__VA_ARGS__)
+    #define UVM_ASSERT(expr)                _UVM_ASSERT_MSG(expr, #expr, "\n")
 #else
    #define UVM_ASSERT_MSG(expr, fmt, ...)  UVM_ASSERT_MSG_IGNORE(expr, fmt, ##__VA_ARGS__)
    #define UVM_ASSERT(expr)                UVM_ASSERT_MSG_IGNORE(expr, "\n")
 #endif

+// UVM_ASSERT_RELEASE and UVM_ASSERT_MSG_RELEASE are always included in the
+// build, even on release builds. They are skipped at runtime if
+// uvm_release_asserts is 0.
+
+// Whether release asserts are enabled and whether they should dump the stack
+// and set the global error.
+extern int uvm_release_asserts;
+extern int uvm_release_asserts_dump_stack;
+extern int uvm_release_asserts_set_global_error;
+extern bool uvm_release_asserts_set_global_error_for_tests;
+
+// Given these are enabled for release builds, we need to be more cautious than
+// in UVM_ASSERT(). Use a ratelimited print and only dump the stack if a module
+// param is enabled.
+#define _UVM_ASSERT_MSG_RELEASE(expr, cond, fmt, ...)                                                   \
+    do {                                                                                                \
+        if (uvm_release_asserts && unlikely(!(expr))) {                                                 \
+            UVM_ASSERT_PRINT_RL("Assert failed, condition %s not true" fmt, cond, ##__VA_ARGS__);       \
+            if (uvm_release_asserts_set_global_error || uvm_release_asserts_set_global_error_for_tests) \
+                uvm_global_set_fatal_error(NV_ERR_INVALID_STATE);                                       \
+            if (uvm_release_asserts_dump_stack)                                                         \
+                dump_stack();                                                                           \
+            on_uvm_assert();                                                                            \
+        }                                                                                               \
+    } while (0)
+
+#define UVM_ASSERT_MSG_RELEASE(expr, fmt, ...)  _UVM_ASSERT_MSG_RELEASE(expr, #expr, ": " fmt, ##__VA_ARGS__)
+#define UVM_ASSERT_RELEASE(expr)                _UVM_ASSERT_MSG_RELEASE(expr, #expr, "\n")
+
 // Provide a short form of UUID's, typically for use in debug printing:
 #define ABBREV_UUID(uuid) (unsigned)(uuid)

--- a/kernel-open/nvidia-uvm/uvm_gpu_semaphore.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu_semaphore.c
@@ -25,6 +25,7 @@
 #include "uvm_lock.h"
 #include "uvm_global.h"
 #include "uvm_kvmalloc.h"
+#include "uvm_channel.h" // For UVM_GPU_SEMAPHORE_MAX_JUMP

 #define UVM_SEMAPHORE_SIZE 4
 #define UVM_SEMAPHORE_PAGE_SIZE PAGE_SIZE
@@ -467,9 +468,16 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
    // push, it's easily guaranteed because of the small number of GPFIFO
    // entries available per channel (there could be at most as many pending
    // pushes as GPFIFO entries).
-    if (new_sem_value < old_sem_value)
+    if (unlikely(new_sem_value < old_sem_value))
        new_value += 1ULL << 32;

+    // Check for unexpected large jumps of the semaphore value
+    UVM_ASSERT_MSG_RELEASE(new_value - old_value <= UVM_GPU_SEMAPHORE_MAX_JUMP,
+                           "GPU %s unexpected semaphore (CPU VA 0x%llx) jump from 0x%llx to 0x%llx\n",
+                           tracking_semaphore->semaphore.page->pool->gpu->parent->name,
+                           (NvU64)(uintptr_t)tracking_semaphore->semaphore.payload,
+                           old_value, new_value);
+
    // Use an atomic write even though the spinlock is held so that the value can
    // be (carefully) read atomically outside of the lock.
    //
--- a/kernel-open/nvidia-uvm/uvm_gpu_semaphore_test.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu_semaphore_test.c
@@ -27,6 +27,18 @@
 #include "uvm_va_space.h"
 #include "uvm_kvmalloc.h"

+static NV_STATUS set_and_test(uvm_gpu_tracking_semaphore_t *tracking_sem, NvU64 new_value)
+{
+    uvm_gpu_semaphore_set_payload(&tracking_sem->semaphore, (NvU32)new_value);
+    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_update_completed_value(tracking_sem) == new_value);
+    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value));
+    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value - 1));
+    TEST_CHECK_RET(!uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value + 1));
+    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_completed(tracking_sem));
+
+    return NV_OK;
+}
+
 static NV_STATUS add_and_test(uvm_gpu_tracking_semaphore_t *tracking_sem, NvU32 increment_by)
 {
    NvU64 new_value;
@@ -43,13 +55,45 @@ static NV_STATUS add_and_test(uvm_gpu_tracking_semaphore_t *tracking_sem, NvU32
    TEST_CHECK_RET(!uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value));
    TEST_CHECK_RET(!uvm_gpu_tracking_semaphore_is_completed(tracking_sem));

-    uvm_gpu_semaphore_set_payload(&tracking_sem->semaphore, (NvU32)new_value);
-    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_update_completed_value(tracking_sem) == new_value);
+    TEST_NV_CHECK_RET(set_and_test(tracking_sem, new_value));
    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, completed));
-    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value));
-    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value - 1));
-    TEST_CHECK_RET(!uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value + 1));
-    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_completed(tracking_sem));
+
+    return NV_OK;
+}
+
+// Force the semaphore payload, completed_value and queued_value to value
+// directly, avoiding UVM_GPU_SEMAPHORE_MAX_JUMP detection.
+static void manual_set(uvm_gpu_tracking_semaphore_t *tracking_sem, NvU64 value)
+{
+    uvm_gpu_semaphore_set_payload(&tracking_sem->semaphore, (NvU32)value);  // payload keeps only the low 32 bits
+    atomic64_set(&tracking_sem->completed_value, value);
+    tracking_sem->queued_value = value;
+}
+
+// Start from starting_value, overwrite the payload, and expect the next update to raise a global error
+static NV_STATUS set_and_expect_error(uvm_gpu_tracking_semaphore_t *tracking_sem, NvU64 starting_value, NvU32 payload)
+{
+    manual_set(tracking_sem, starting_value);
+    uvm_gpu_semaphore_set_payload(&tracking_sem->semaphore, payload);
+
+    TEST_CHECK_RET(uvm_global_get_status() == NV_OK);  // no global error may be pending before the update
+    uvm_gpu_tracking_semaphore_update_completed_value(tracking_sem);
+    TEST_CHECK_RET(uvm_global_reset_fatal_error() == NV_ERR_INVALID_STATE);  // presumably also clears the error for the next case - confirm
+
+    return NV_OK;
+}
+
+static NV_STATUS test_invalid_jumps(uvm_gpu_tracking_semaphore_t *tracking_sem)
+{
+    int i;
+    for (i = 0; i < 10; ++i) {  // every case below is expected to raise a global error
+        NvU64 base = (1ULL<<32) * i;  // low 32 bits of base are zero
+        TEST_NV_CHECK_RET(set_and_expect_error(tracking_sem, base, UVM_GPU_SEMAPHORE_MAX_JUMP + 1));  // payload one past the max jump
+        TEST_NV_CHECK_RET(set_and_expect_error(tracking_sem, base, UINT_MAX));
+        TEST_NV_CHECK_RET(set_and_expect_error(tracking_sem, base + i + 1, i));  // payload one below the starting low 32 bits
+        TEST_NV_CHECK_RET(set_and_expect_error(tracking_sem, base + UINT_MAX / 2, UINT_MAX / 2 + UVM_GPU_SEMAPHORE_MAX_JUMP + 1));
+        TEST_NV_CHECK_RET(set_and_expect_error(tracking_sem, base + UINT_MAX / 2, UINT_MAX / 2 - i - 1));
+    }

    return NV_OK;
 }
@@ -73,11 +117,31 @@ static NV_STATUS test_tracking(uvm_va_space_t *va_space)
        goto done;

    for (i = 0; i < 100; ++i) {
-        status = add_and_test(&tracking_sem, UINT_MAX - 1);
+        status = add_and_test(&tracking_sem, UVM_GPU_SEMAPHORE_MAX_JUMP - i);
        if (status != NV_OK)
            goto done;
    }

+    // Test wrap-around cases
+    for (i = 0; i < 100; ++i) {
+        // Start with a value right before wrap-around
+        NvU64 starting_value = (1ULL<<32) * (i + 1) - i - 1;
+        manual_set(&tracking_sem, starting_value);
+
+        // And set payload to after wrap-around
+        status = set_and_test(&tracking_sem, (1ULL<<32) * (i + 1) + i);
+        if (status != NV_OK)
+            goto done;
+    }
+
+    g_uvm_global.disable_fatal_error_assert = true;
+    uvm_release_asserts_set_global_error_for_tests = true;
+    status = test_invalid_jumps(&tracking_sem);
+    uvm_release_asserts_set_global_error_for_tests = false;
+    g_uvm_global.disable_fatal_error_assert = false;
+    if (status != NV_OK)
+        goto done;
+
 done:
    uvm_gpu_tracking_semaphore_free(&tracking_sem);
    return status;
--- a/kernel-open/nvidia-uvm/uvm_push.h
+++ b/kernel-open/nvidia-uvm/uvm_push.h
@@ -52,11 +52,21 @@ typedef enum
    // By default all operations include a membar sys after any transfer and
    // before a semaphore operation.
    // This flag indicates that next operation should use no membar at all.
+    //
+    // For end of push semaphore release, this flag indicates that the push
+    // itself does not need a membar to be used (membar sys is the default). A
+    // membar may still be used, if needed to order the semaphore release
+    // write. See comments in uvm_channel_end_push().
    UVM_PUSH_FLAG_NEXT_MEMBAR_NONE,

    // By default all operations include a membar sys after any transfer and
    // before a semaphore operation.
    // This flag indicates that next operation should use a membar gpu instead.
+    //
+    // For end of push semaphore release, this flag indicates that the push
+    // itself only needs a membar gpu (the default is membar sys). A membar sys
+    // may still be used, if needed to order the semaphore release write. See
+    // comments in uvm_channel_end_push().
    UVM_PUSH_FLAG_NEXT_MEMBAR_GPU,

    UVM_PUSH_FLAG_COUNT,
--- a/kernel-open/nvidia/nv-dmabuf.c
+++ b/kernel-open/nvidia/nv-dmabuf.c
@@ -820,8 +820,13 @@ nv_dma_buf_reuse(
        goto cleanup_dmabuf;
    }

+
+
+
+
    if (params->index > (priv->total_objects - params->numObjects))
    {
+
        status = NV_ERR_INVALID_ARGUMENT;
        goto unlock_priv;
    }
--- a/kernel-open/nvidia/nv-mmap.c
+++ b/kernel-open/nvidia/nv-mmap.c
@@ -132,6 +132,13 @@ nvidia_vma_access(
    pageIndex = ((addr - vma->vm_start) >> PAGE_SHIFT);
    pageOffset = (addr & ~PAGE_MASK);

+
+
+
+
+
+
+
    if (!mmap_context->valid)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: VM: invalid mmap context\n");
@@ -430,7 +437,7 @@ static int nvidia_mmap_numa(
    const nv_alloc_mapping_context_t *mmap_context)
 {
    NvU64 start, addr;
-    unsigned int pages;
+    NvU64 pages;
    NvU64 i;

    pages = NV_VMA_SIZE(vma) >> PAGE_SHIFT;
@@ -509,6 +516,13 @@ int nvidia_mmap_helper(
        NvU64 access_start = mmap_context->access_start;
        NvU64 access_len = mmap_context->access_size;

+
+
+
+
+
+
+
        if (IS_REG_OFFSET(nv, access_start, access_len))
        {
            if (nv_encode_caching(&vma->vm_page_prot, NV_MEMORY_UNCACHED,
--- a/kernel-open/nvidia/nv.c
+++ b/kernel-open/nvidia/nv.c
@@ -1467,6 +1467,11 @@ static int nv_open_device(nv_state_t *nv, nvidia_stack_t *sp)
        return -ENODEV;
    }

+
+
+
+
+
    if ( ! (nv->flags & NV_FLAG_OPEN))
    {
        /* Sanity check: !NV_FLAG_OPEN requires usage_count == 0 */
--- a/kernel-open/nvidia/nvidia.Kbuild
+++ b/kernel-open/nvidia/nvidia.Kbuild
@@ -219,6 +219,7 @@ NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_dram_clk_to_mc_clk
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_get_dram_num_channels
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tegra_dram_types
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_pxm_to_node
+NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_screen_info

 NV_CONFTEST_TYPE_COMPILE_TESTS += file_operations
 NV_CONFTEST_TYPE_COMPILE_TESTS += kuid_t
@@ -242,9 +243,9 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += vmalloc_has_pgprot_t_arg
 NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock
 NV_CONFTEST_TYPE_COMPILE_TESTS += pci_channel_state
 NV_CONFTEST_TYPE_COMPILE_TESTS += pci_dev_has_ats_enabled
-NV_CONFTEST_TYPE_COMPILE_TESTS += mt_device_gre
 NV_CONFTEST_TYPE_COMPILE_TESTS += remove_memory_has_nid_arg
 NV_CONFTEST_TYPE_COMPILE_TESTS += add_memory_driver_managed_has_mhp_flags_arg
+NV_CONFTEST_TYPE_COMPILE_TESTS += num_registered_fb

 NV_CONFTEST_GENERIC_COMPILE_TESTS += dom0_kernel_present
 NV_CONFTEST_GENERIC_COMPILE_TESTS += nvidia_vgpu_kvm_build
--- a/kernel-open/nvidia/nvlink_linux.c
+++ b/kernel-open/nvidia/nvlink_linux.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2015-2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2015-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@@ -207,7 +207,10 @@ static int nvlink_fops_release(struct inode *inode, struct file *filp)

    nvlink_print(NVLINK_DBG_INFO, "nvlink driver close\n");

-    WARN_ON(private == NULL);
+
+
+
+

    mutex_lock(&nvlink_drvctx.lock);

--- a/kernel-open/nvidia/os-interface.c
+++ b/kernel-open/nvidia/os-interface.c
@@ -1120,11 +1120,14 @@ void NV_API_CALL os_get_screen_info(
    NvU64 consoleBar2Address
 )
 {
-#if defined(CONFIG_FB)
-    int i;
    *pPhysicalAddress = 0;
    *pFbWidth = *pFbHeight = *pFbDepth = *pFbPitch = 0;

+#if defined(CONFIG_FB) && defined(NV_NUM_REGISTERED_FB_PRESENT)
+    if (num_registered_fb > 0)
+    {
+        int i;
+
        for (i = 0; i < num_registered_fb; i++)
        {
            if (!registered_fb[i])
@@ -1142,9 +1145,33 @@ void NV_API_CALL os_get_screen_info(
                break;
            }
        }
-#else
-    *pPhysicalAddress = 0;
-    *pFbWidth = *pFbHeight = *pFbDepth = *pFbPitch = 0;
+    }
+#elif NV_IS_EXPORT_SYMBOL_PRESENT_screen_info
+    /*
+     * If there is not a framebuffer console, return 0 size.
+     *
+     * orig_video_isVGA is set to 1 during early Linux kernel
+     * initialization, and then will be set to a value, such as
+     * VIDEO_TYPE_VLFB or VIDEO_TYPE_EFI if an fbdev console is used.
+     */
+    if (screen_info.orig_video_isVGA > 1)
+    {
+        NvU64 physAddr = screen_info.lfb_base;
+#if defined(VIDEO_CAPABILITY_64BIT_BASE)
+        physAddr |= (NvU64)screen_info.ext_lfb_base << 32;
+#endif
+
+        /* Make sure base address is mapped to GPU BAR */
+        if ((physAddr == consoleBar1Address) ||
+            (physAddr == consoleBar2Address))
+        {
+            *pPhysicalAddress = physAddr;
+            *pFbWidth = screen_info.lfb_width;
+            *pFbHeight = screen_info.lfb_height;
+            *pFbDepth = screen_info.lfb_depth;
+            *pFbPitch = screen_info.lfb_linelength;
+        }
+    }
 #endif
 }

--- a/src/common/inc/nvlog_defs.h
+++ b/src/common/inc/nvlog_defs.h
@@ -195,6 +195,11 @@ extern NVLOG_LOGGER NvLogLogger;
 #define NVLOG_BUFFER_FLAGS_FORMAT_LIBOS_LOG              1
 #define NVLOG_BUFFER_FLAGS_FORMAT_MEMTRACK               2

+// Never deallocate this buffer until RM is unloaded
+#define NVLOG_BUFFER_FLAGS_PRESERVE                     11:11
+#define NVLOG_BUFFER_FLAGS_PRESERVE_NO                  0
+#define NVLOG_BUFFER_FLAGS_PRESERVE_YES                 1
+
 // Buffer GPU index
 #define NVLOG_BUFFER_FLAGS_GPU_INSTANCE              31:24

--- a/src/common/modeset/timing/nvtiming.h
+++ b/src/common/modeset/timing/nvtiming.h
@@ -4091,6 +4091,8 @@ typedef struct tagNVT_GAMUT_METADATA
 #define NVT_DPCD_ADDRESS_DOWN_REP_BUFFER_FIELD              0x01400
 #define NVT_DPCD_ADDRESS_UP_REQ_BUFFER_FIELD                0x01600
 #define NVT_DPCD_ADDRESS_DEVICE_SERVICE_IRQ_VECTOR_ESI0     0x02003
+#define NVT_DPCD_ADDRESS_DP_TUNNELING_DEVICE_IEEE_OUI       0xE0000
+#define NVT_DPCD_ADDRESS_DP_TUNNELING_DEVICE_ID_STRING      0xE0003
 #define NVT_DPCD_ADDRESS_DP_TUNNELING_CAPS_SUPPORT_FIELD    0xE000D
 #define NVT_DPCD_ADDRESS_DP_IN_ADAPTER_INFO_FIELD           0xE000E
 #define NVT_DPCD_ADDRESS_USB4_DRIVER_ID_FIELD               0xE000F
@@ -5079,7 +5081,7 @@ typedef struct tagNVT_DPCD_CONFIG

 typedef struct tagNVT_DPCD_DP_TUNNELING_CAPS
 {
-    NvU8 dpTunnelingSupport               : 1; // DP Tunneling through USB4 Support
+    NvU8 dpTunneling                      : 1; // DP Tunneling through USB4 Support
    NvU8 reserved                         : 5; // Reserved.
    NvU8 dpPanelReplayTunnelingOptSupport : 1; // Panel Replay Tunneling Optimization Support
    NvU8 dpInBwAllocationModeSupport      : 1; // DP IN Bandwidth Allocation Mode Support
--- a/src/common/nvlink/interface/nvlink_lib_ctrl.h
+++ b/src/common/nvlink/interface/nvlink_lib_ctrl.h
@@ -64,7 +64,7 @@
 * Total number of nvlink endpoints core library can have
 *  This is mapped to NVLINK_MAX_SYSTEM_LINK_NUM in drivers/nvlink/interface/nvlink.h
 */
-#define NVLINK_MAX_NVLINK_ENDPOINTS 312
+#define NVLINK_MAX_NVLINK_ENDPOINTS 624

 #define NVLINK_VERSION_STRING_LENGTH    64

--- a/src/common/nvlink/kernel/nvlink/interface/nvlink_ioctl_entry.c
+++ b/src/common/nvlink/kernel/nvlink/interface/nvlink_ioctl_entry.c
@@ -28,6 +28,7 @@
 #include "../nvlink_ctx.h"
 #include "../nvlink_helper.h"
 #include "nvlink_lock.h"
+#include "nvctassert.h"

 #define NVLINK_IOC_GET_BUF(ctrlParams, type) (ctrlParams)->size >= sizeof(type) ? (type *) (ctrlParams)->buf : NULL

@@ -3423,6 +3424,8 @@ nvlink_lib_ctrl_get_device_link_states
    NvU32         numLinks  = 0;
    NvU32         i         = 0;

+    ct_assert(NVLINK_MAX_SYSTEM_LINK_NUM == NVLINK_MAX_NVLINK_ENDPOINTS);
+
    nvlink_link   **links = (nvlink_link **)nvlink_malloc(
                            sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
    if (links == NULL)
--- a/src/common/uproc/os/libos-v2.0.0/debug/logdecode.c
+++ b/src/common/uproc/os/libos-v2.0.0/debug/logdecode.c
@@ -1041,24 +1041,41 @@ static NvBool libosCopyLogToNvlog_nowrap(LIBOS_LOG_DECODE_LOG *pLog)
    NvU64 putCopy                      = pLog->physicLogBuffer[0];
    NvU64 putOffset                    = putCopy * sizeof(NvU64) + sizeof(NvU64);

-    if (putOffset == pNvLogBuffer->pos)
+    //
+    // If RM was not unloaded, we will reuse a preserved nowrap nvlog buffer with the fresh
+    // physical log buffer. In this case, we fix up all the offsets into the nvlog buffer to be
+    // relative to its preserved position rather than the start.
+    //
+    NvU64 nvlogPos                     = pNvLogBuffer->pos - pLog->preservedNoWrapPos;
+
+    if (putOffset < nvlogPos)
+    {
+        // Buffer put counter unexpectedly reset. Terminate nowrap log collection.
+        return NV_FALSE;
+    }
+
+    if (putOffset == nvlogPos)
    {
        // No new data
        return NV_TRUE;
    }

-    if (putOffset > pNvLogBuffer->size - NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data) - sizeof(NvU64))
+    if (putOffset + pLog->preservedNoWrapPos >
+        pNvLogBuffer->size - NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data) - sizeof(NvU64))
    {
        // Are we done filling nowrap?
        return NV_FALSE;
    }

-    NvU64 len  = putOffset - pNvLogBuffer->pos;
-    NvU8 *pSrc = ((NvU8 *)pLog->physicLogBuffer) + pNvLogBuffer->pos;
+    NvU64 len  = putOffset - nvlogPos;
+    NvU8 *pSrc = ((NvU8 *)pLog->physicLogBuffer) + nvlogPos;
    NvU8 *pDst = pNoWrapBuf->data + pNvLogBuffer->pos;
+
+    pLog->bDidPush = NV_TRUE;
+
    portMemCopy(pDst, len, pSrc, len);
-    pNvLogBuffer->pos            = putOffset; // TODO: usage of NVLOG_BUFFER::pos is sus here, reconsider?
-    *(NvU64 *)(pNoWrapBuf->data) = putCopy;
+    pNvLogBuffer->pos            = putOffset + pLog->preservedNoWrapPos; // TODO: usage of NVLOG_BUFFER::pos is sus here, reconsider?
+    *(NvU64 *)(pNoWrapBuf->data) = putCopy + pLog->preservedNoWrapPos / sizeof(NvU64);
    return NV_TRUE;
 }

@@ -1095,6 +1112,46 @@ static void libosExtractLogs_nvlog(LIBOS_LOG_DECODE *logDecode, NvBool bSyncNvLo
    }
 }

+// Set the PRESERVE flag on the nowrap nvlog buffer of every log that pushed data.
+void libosPreserveLogs(LIBOS_LOG_DECODE *pLogDecode)
+{
+    NvU64 idx;
+
+    for (idx = 0; idx < pLogDecode->numLogBuffers; idx++)
+    {
+        LIBOS_LOG_DECODE_LOG *pEntry = &pLogDecode->log[idx];
+        NVLOG_BUFFER *pPreserved;
+
+        // Nothing to preserve if this log never pushed or has no nowrap handle.
+        if (!pEntry->bDidPush || pEntry->hNvLogNoWrap == 0)
+            continue;
+
+        pPreserved = NvLogLogger.pBuffers[pEntry->hNvLogNoWrap];
+        if (pPreserved != NULL)
+            pPreserved->flags |= DRF_DEF(LOG, _BUFFER_FLAGS, _PRESERVE, _YES);
+    }
+}
+
+// Look up the nvlog buffer registered under tag. Returns NV_TRUE and stores its handle in
+// *pHandle only if it is flagged PRESERVE, matches gpuInstance and still has room for data.
+static NvBool findPreservedNvlogBuffer(NvU32 tag, NvU32 gpuInstance, NVLOG_BUFFER_HANDLE *pHandle)
+{
+    NVLOG_BUFFER_HANDLE handle = 0;
+
+    if (nvlogGetBufferHandleFromTag(tag, &handle) != NV_OK)
+        return NV_FALSE;
+
+    NVLOG_BUFFER *pNvLogBuffer = NvLogLogger.pBuffers[handle];
+    if (pNvLogBuffer == NULL || // never dereference an empty slot; libosPreserveLogs() guards the same table
+        !FLD_TEST_DRF(LOG, _BUFFER_FLAGS, _PRESERVE, _YES, pNvLogBuffer->flags) ||
+        DRF_VAL(LOG, _BUFFER_FLAGS, _GPU_INSTANCE, pNvLogBuffer->flags) != gpuInstance ||
+        pNvLogBuffer->pos >= pNvLogBuffer->size - NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data) - sizeof(NvU64))
+        return NV_FALSE;
+
+    *pHandle = handle;
+    return NV_TRUE;
+}
+
 #endif // LIBOS_LOG_TO_NVLOG

 /**
@@ -1211,11 +1268,18 @@ void libosLogAddLogEx(LIBOS_LOG_DECODE *logDecode, void *buffer, NvU64 bufferSiz
    pLog->hNvLogWrap   = 0;
    pLog->bNvLogNoWrap = NV_FALSE;

-    LIBOS_LOG_NVLOG_BUFFER *pNoWrapBuf;
+    pLog->bDidPush             = NV_FALSE;
+    pLog->preservedNoWrapPos   = 0;

+    LIBOS_LOG_NVLOG_BUFFER *pNoWrapBuf;
+    NvU32 tag = LIBOS_LOG_NVLOG_BUFFER_TAG(logDecode->sourceName, i * 2);
+    NvBool bFoundPreserved = findPreservedNvlogBuffer(tag, gpuInstance, &pLog->hNvLogNoWrap);
+
+    if (!bFoundPreserved)
+    {
        status = nvlogAllocBuffer(
            bufferSize + NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data), libosNoWrapBufferFlags,
-        LIBOS_LOG_NVLOG_BUFFER_TAG(logDecode->sourceName, i * 2),
+            tag,
            &pLog->hNvLogNoWrap);

        if (status == NV_OK)
@@ -1237,13 +1301,27 @@ void libosLogAddLogEx(LIBOS_LOG_DECODE *logDecode, void *buffer, NvU64 bufferSiz
        {
            printf("nvlogAllocBuffer nowrap failed\n");
        }
+    }
+    else
+    {
+        pLog->bNvLogNoWrap = NV_TRUE;
+        pLog->preservedNoWrapPos = NvLogLogger.pBuffers[pLog->hNvLogNoWrap]->pos;
+
+        //
+        // The 0th NvU64 is the last value of put pointer from the physical log buffer, which is
+        // the number of NvU64 log buffer elements in it plus one.
+        // Subtract one NvU64 from it to avoid off-by-one error.
+        //
+        if (pLog->preservedNoWrapPos >= sizeof(NvU64))
+            pLog->preservedNoWrapPos -= sizeof(NvU64);
+    }

    LIBOS_LOG_NVLOG_BUFFER *pWrapBuf;
+    tag = LIBOS_LOG_NVLOG_BUFFER_TAG(logDecode->sourceName, i * 2 + 1);

    status = nvlogAllocBuffer(
        bufferSize + NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data), libosWrapBufferFlags,
-        LIBOS_LOG_NVLOG_BUFFER_TAG(logDecode->sourceName, i * 2 + 1),
-        &pLog->hNvLogWrap);
+        tag, &pLog->hNvLogWrap);

    if (status == NV_OK)
    {
@@ -1349,13 +1427,13 @@ void libosLogDestroy(LIBOS_LOG_DECODE *logDecode)

        if (pLog->hNvLogNoWrap != 0)
        {
-            nvlogDeallocBuffer(pLog->hNvLogNoWrap);
+            nvlogDeallocBuffer(pLog->hNvLogNoWrap, NV_FALSE);
            pLog->hNvLogNoWrap = 0;
        }

        if (pLog->hNvLogWrap != 0)
        {
-            nvlogDeallocBuffer(pLog->hNvLogWrap);
+            nvlogDeallocBuffer(pLog->hNvLogWrap, NV_FALSE);
            pLog->hNvLogWrap = 0;
        }
    }
--- a/src/common/uproc/os/libos-v2.0.0/debug/logdecode.h
+++ b/src/common/uproc/os/libos-v2.0.0/debug/logdecode.h
@@ -108,6 +108,9 @@ struct LIBOS_LOG_DECODE_LOG
    NvU32 hNvLogNoWrap;  // No wrap buffer captures first records.
    NvU32 hNvLogWrap;    // Wrap buffer captures last records.
    NvBool bNvLogNoWrap; // NV_TRUE if no wrap buffer not full.
+
+    NvBool bDidPush;     // NV_TRUE if this buffer was ever pushed to
+    NvU64 preservedNoWrapPos; // Position in preserved nvlog buffer
 #endif

 #if LIBOS_LOG_DECODE_ENABLE
@@ -170,6 +173,8 @@ void libosLogDestroy(LIBOS_LOG_DECODE *logDecode);

 void libosExtractLogs(LIBOS_LOG_DECODE *logDecode, NvBool bSyncNvLog);

+void libosPreserveLogs(LIBOS_LOG_DECODE *pLogDecode);
+
 #ifdef __cplusplus
 }
 #endif
--- a/src/nvidia-modeset/os-interface/include/nvidia-modeset-os-interface.h
+++ b/src/nvidia-modeset/os-interface/include/nvidia-modeset-os-interface.h
@@ -110,6 +110,7 @@ typedef struct {
    } set_maxval;
 } NvKmsSyncPtOpParams;

+NvBool nvkms_output_rounding_fix(void);

 void   nvkms_call_rm    (void *ops);
 void*  nvkms_alloc      (size_t size,
--- a/src/nvidia-modeset/src/nvkms-evo3.c
+++ b/src/nvidia-modeset/src/nvkms-evo3.c
@@ -1288,6 +1288,8 @@ static void EvoSetOCsc0C5(NVDispEvoPtr pDispEvo, const NvU32 head)

    const float32_t zeroF32 = NvU32viewAsF32(NV_FLOAT_ZERO);
    const float32_t oneF32 = NvU32viewAsF32(NV_FLOAT_ONE);
+    const float32_t inv2048F32 = f32_div(NvU32viewAsF32(NV_FLOAT_HALF),
+                                         NvU32viewAsF32(NV_FLOAT_1024));
    /* divide satCos by the default setting of 1024 */
    const float32_t satCos = f32_div(i32_to_f32(pHeadState->procAmp.satCos),
                                     NvU32viewAsF32(NV_FLOAT_1024));
@@ -1324,6 +1326,12 @@ static void EvoSetOCsc0C5(NVDispEvoPtr pDispEvo, const NvU32 head)
    ocsc0Matrix = nvMultiply3x4Matrix(&satHueMatrix, &ocsc0Matrix);
    ocsc0Matrix = nvMultiply3x4Matrix(&CrYCbtoRGBMatrix, &ocsc0Matrix);

+    if (nvkms_output_rounding_fix()) {
+        ocsc0Matrix.m[0][3] = f32_add(ocsc0Matrix.m[0][3], inv2048F32);
+        ocsc0Matrix.m[1][3] = f32_add(ocsc0Matrix.m[1][3], inv2048F32);
+        ocsc0Matrix.m[2][3] = f32_add(ocsc0Matrix.m[2][3], inv2048F32);
+    }
+
    nvDmaSetStartEvoMethod(pChannel, NVC57D_HEAD_SET_OCSC0COEFFICIENT_C00(head), 12);
    nvDmaSetEvoMethodData(pChannel, DRF_NUM(C57D, _HEAD_SET_OCSC0COEFFICIENT_C00, _VALUE, cscCoefConvertS514(ocsc0Matrix.m[0][0])));
    nvDmaSetEvoMethodData(pChannel, DRF_NUM(C57D, _HEAD_SET_OCSC0COEFFICIENT_C01, _VALUE, cscCoefConvertS514(ocsc0Matrix.m[0][1])));
@@ -1965,11 +1973,13 @@ static inline NvU32 GetMaxPixelsFetchedPerLine(NvU16 inWidth,
 static void SetScalingUsageBoundsOneWindow5(
                                NVDevEvoPtr pDevEvo, NvU32 window,
                                const struct NvKmsScalingUsageBounds *pScaling,
+                                NvBool layerUsable,
                                const NVHwModeViewPortEvo *pViewPort,
                                NVEvoUpdateState *updateState)
 {
    NVEvoChannelPtr pChannel = pDevEvo->core;
    NvU32 setWindowUsageBounds = NV_EVO3_DEFAULT_WINDOW_USAGE_BOUNDS_C5;
+    NvU32 maxPixelsFetchedPerLine;

    nvUpdateUpdateState(pDevEvo, updateState, pChannel);

@@ -1981,10 +1991,15 @@ static void SetScalingUsageBoundsOneWindow5(
        DRF_NUM(C57D, _WINDOW_SET_MAX_INPUT_SCALE_FACTOR, _VERTICAL,
                pScaling->maxVDownscaleFactor));

+    if (layerUsable) {
+        maxPixelsFetchedPerLine = GetMaxPixelsFetchedPerLine(pViewPort->in.width,
+                                                   pScaling->maxHDownscaleFactor);
+    } else {
+        maxPixelsFetchedPerLine = 0;
+    }
+
    setWindowUsageBounds |=
-        (DRF_NUM(C57D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _MAX_PIXELS_FETCHED_PER_LINE,
-                 GetMaxPixelsFetchedPerLine(pViewPort->in.width,
-                 pScaling->maxHDownscaleFactor))) |
+        (DRF_NUM(C57D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _MAX_PIXELS_FETCHED_PER_LINE,maxPixelsFetchedPerLine)) |
        (pScaling->vTaps >= NV_EVO_SCALER_5TAPS ?
            DRF_DEF(C57D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _INPUT_SCALER_TAPS, _TAPS_5) :
            DRF_DEF(C57D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _INPUT_SCALER_TAPS, _TAPS_2)) |
@@ -2056,8 +2071,9 @@ static NvBool EvoSetUsageBoundsC5(NVDevEvoPtr pDevEvo, NvU32 sd, NvU32 head,
    needCoreUpdate = EvoSetUsageBounds3(pDevEvo, sd, head, pUsage, updateState);

    for (layer = 0; layer < pDevEvo->head[head].numLayers; layer++) {
-        if (!nvEvoScalingUsageBoundsEqual(&pCurrentUsage->layer[layer].scaling,
-                                          &pUsage->layer[layer].scaling)) {
+        if ((pCurrentUsage->layer[layer].usable != pUsage->layer[layer].usable) ||
+            (!nvEvoScalingUsageBoundsEqual(&pCurrentUsage->layer[layer].scaling,
+                                           &pUsage->layer[layer].scaling))) {
            const NVHwModeViewPortEvo *pViewPort =
                &pDevEvo->gpus[sd].pDispEvo->headState[head].timings.viewPort;

@@ -2066,6 +2082,7 @@ static NvBool EvoSetUsageBoundsC5(NVDevEvoPtr pDevEvo, NvU32 sd, NvU32 head,
                NV_EVO_CHANNEL_MASK_WINDOW_NUMBER(
                    pDevEvo->head[head].layer[layer]->channelMask),
                &pUsage->layer[layer].scaling,
+                pUsage->layer[layer].usable,
                pViewPort,
                updateState);
            needCoreUpdate = TRUE;
@@ -4383,7 +4400,9 @@ static void EvoSetLUTContextDmaC5(const NVDispEvoRec *pDispEvo,

    nvDmaSetStartEvoMethod(pChannel, NVC57D_HEAD_SET_OLUT_CONTROL(head), 1);
    nvDmaSetEvoMethodData(pChannel,
-        DRF_DEF(C57D, _HEAD_SET_OLUT_CONTROL, _INTERPOLATE, _ENABLE) |
+        (!nvkms_output_rounding_fix() ?
+            DRF_DEF(C57D, _HEAD_SET_OLUT_CONTROL, _INTERPOLATE, _ENABLE) :
+            DRF_DEF(C57D, _HEAD_SET_OLUT_CONTROL, _INTERPOLATE, _DISABLE)) |
        DRF_DEF(C57D, _HEAD_SET_OLUT_CONTROL, _MIRROR, _DISABLE) |
        DRF_DEF(C57D, _HEAD_SET_OLUT_CONTROL, _MODE, _DIRECT10) |
        DRF_NUM(C57D, _HEAD_SET_OLUT_CONTROL, _SIZE, NV_LUT_VSS_HEADER_SIZE +
@@ -5180,13 +5199,11 @@ static NvBool EvoSetViewportInOut3(NVDevEvoPtr pDevEvo, const int head,
                                   const NVHwModeViewPortEvo *pViewPortMin,
                                   const NVHwModeViewPortEvo *pViewPort,
                                   const NVHwModeViewPortEvo *pViewPortMax,
-                                   NVEvoUpdateState *updateState,
-                                   NvU32 setWindowUsageBounds)
+                                   NVEvoUpdateState *updateState)
 {
    const NVEvoCapabilitiesPtr pEvoCaps = &pDevEvo->gpus[0].capabilities;
    NVEvoChannelPtr pChannel = pDevEvo->core;
    struct NvKmsScalingUsageBounds scalingUsageBounds = { };
-    NvU32 win;

    /* These methods should only apply to a single pDpy */
    nvAssert(pDevEvo->subDevMaskStackDepth > 0);
@@ -5232,9 +5249,33 @@ static NvBool EvoSetViewportInOut3(NVDevEvoPtr pDevEvo, const int head,
        DRF_NUM(C37D, _HEAD_SET_MAX_OUTPUT_SCALE_FACTOR, _VERTICAL,
                scalingUsageBounds.maxVDownscaleFactor));

+    return scalingUsageBounds.vUpscalingAllowed;
+}
+
+static void EvoSetViewportInOutC3(NVDevEvoPtr pDevEvo, const int head,
+                                  const NVHwModeViewPortEvo *pViewPortMin,
+                                  const NVHwModeViewPortEvo *pViewPort,
+                                  const NVHwModeViewPortEvo *pViewPortMax,
+                                  NVEvoUpdateState *updateState)
+{
+    NVEvoChannelPtr pChannel = pDevEvo->core;
+    NvU32 win;
+    NvU32 setWindowUsageBounds = NV_EVO3_DEFAULT_WINDOW_USAGE_BOUNDS_C3;
+    NvBool verticalUpscalingAllowed =
+        EvoSetViewportInOut3(pDevEvo, head, pViewPortMin, pViewPort,
+                             pViewPortMax, updateState);
+
+    nvDmaSetStartEvoMethod(pChannel,
+        NVC37D_HEAD_SET_HEAD_USAGE_BOUNDS(head), 1);
+    nvDmaSetEvoMethodData(pChannel,
+        DRF_DEF(C37D, _HEAD_SET_HEAD_USAGE_BOUNDS, _CURSOR, _USAGE_W256_H256) |
+        DRF_DEF(C37D, _HEAD_SET_HEAD_USAGE_BOUNDS, _OUTPUT_LUT, _USAGE_1025) |
+        (verticalUpscalingAllowed ?
+            DRF_DEF(C37D, _HEAD_SET_HEAD_USAGE_BOUNDS, _UPSCALING_ALLOWED, _TRUE) :
+            DRF_DEF(C37D, _HEAD_SET_HEAD_USAGE_BOUNDS, _UPSCALING_ALLOWED, _FALSE)));
     /*
      * Program MAX_PIXELS_FETCHED_PER_LINE window usage bounds
-     * for each window that’s attached to the head.
+      * for each window that is attached to the head.
      *
      * Precomp will clip the post-scaled window to the input viewport, reverse-scale
      * this cropped size back to the input surface domain, and isohub will fetch
@@ -5242,7 +5283,11 @@ static NvBool EvoSetViewportInOut3(NVDevEvoPtr pDevEvo, const int head,
      * so the MAX_PIXELS_FETCHED_PER_LINE will be bounded by the input viewport
      * width. SetScalingUsageBoundsOneWindow5() will take care of updating
      * MAX_PIXELS_FETCHED_PER_LINE, if window scaling is enabled later.
+      * On Volta, program this for each window that is attached to the head. For
+      * Turing+, SetScalingUsageBoundsOneWindow5() will take care of programming
+      * window usage bounds only for the layers/windows in use.
      */
+
    setWindowUsageBounds |=
       DRF_NUM(C37D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _MAX_PIXELS_FETCHED_PER_LINE,
               GetMaxPixelsFetchedPerLine(pViewPort->in.width,
@@ -5256,30 +5301,6 @@ static NvBool EvoSetViewportInOut3(NVDevEvoPtr pDevEvo, const int head,
        nvDmaSetStartEvoMethod(pChannel, NVC37D_WINDOW_SET_WINDOW_USAGE_BOUNDS(win), 1);
        nvDmaSetEvoMethodData(pChannel, setWindowUsageBounds);
    }
-
-    return scalingUsageBounds.vUpscalingAllowed;
-}
-
-static void EvoSetViewportInOutC3(NVDevEvoPtr pDevEvo, const int head,
-                                  const NVHwModeViewPortEvo *pViewPortMin,
-                                  const NVHwModeViewPortEvo *pViewPort,
-                                  const NVHwModeViewPortEvo *pViewPortMax,
-                                  NVEvoUpdateState *updateState)
-{
-    NVEvoChannelPtr pChannel = pDevEvo->core;
-    NvBool verticalUpscalingAllowed =
-        EvoSetViewportInOut3(pDevEvo, head, pViewPortMin, pViewPort,
-                             pViewPortMax, updateState,
-                             NV_EVO3_DEFAULT_WINDOW_USAGE_BOUNDS_C3);
-
-    nvDmaSetStartEvoMethod(pChannel,
-        NVC37D_HEAD_SET_HEAD_USAGE_BOUNDS(head), 1);
-    nvDmaSetEvoMethodData(pChannel,
-        DRF_DEF(C37D, _HEAD_SET_HEAD_USAGE_BOUNDS, _CURSOR, _USAGE_W256_H256) |
-        DRF_DEF(C37D, _HEAD_SET_HEAD_USAGE_BOUNDS, _OUTPUT_LUT, _USAGE_1025) |
-        (verticalUpscalingAllowed ?
-            DRF_DEF(C37D, _HEAD_SET_HEAD_USAGE_BOUNDS, _UPSCALING_ALLOWED, _TRUE) :
-            DRF_DEF(C37D, _HEAD_SET_HEAD_USAGE_BOUNDS, _UPSCALING_ALLOWED, _FALSE)));
 }

 static void EvoSetViewportInOutC5(NVDevEvoPtr pDevEvo, const int head,
@@ -5289,13 +5310,9 @@ static void EvoSetViewportInOutC5(NVDevEvoPtr pDevEvo, const int head,
                                  NVEvoUpdateState *updateState)
 {
    NVEvoChannelPtr pChannel = pDevEvo->core;
-    NvU32 setWindowUsageBounds =
-        (NV_EVO3_DEFAULT_WINDOW_USAGE_BOUNDS_C5 |
-         DRF_DEF(C57D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _INPUT_SCALER_TAPS, _TAPS_2) |
-         DRF_DEF(C57D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _UPSCALING_ALLOWED, _FALSE));
    NvU32 verticalUpscalingAllowed =
        EvoSetViewportInOut3(pDevEvo, head, pViewPortMin, pViewPort,
-                             pViewPortMax, updateState, setWindowUsageBounds);
+                             pViewPortMax, updateState);

    nvDmaSetStartEvoMethod(pChannel,
        NVC57D_HEAD_SET_HEAD_USAGE_BOUNDS(head), 1);
--- a/src/nvidia/arch/nvalloc/unix/include/nv.h
+++ b/src/nvidia/arch/nvalloc/unix/include/nv.h
@@ -619,27 +619,50 @@ typedef enum
 #define NV_GET_NV_STATE(pGpu) \
    (nv_state_t *)((pGpu) ? (pGpu)->pOsGpuInfo : NULL)

-#define IS_REG_OFFSET(nv, offset, length)                                       \
-    (((offset) >= (nv)->regs->cpu_address) &&                                   \
-    (((offset) + ((length)-1)) <=                                               \
-        (nv)->regs->cpu_address + ((nv)->regs->size-1)))
+/*
+ * Range checks for the address ranges described by nv_state_t (regs, fb,
+ * ud and the IMEM BAR).
+ *
+ * Each IS_*_OFFSET() helper returns true only when the whole byte range
+ * [offset, offset + length - 1] lies inside the corresponding range.
+ * The "(offset + (length - 1)) >= offset" term rejects a range whose end
+ * wraps around NvU64 (this includes a zero length at a non-zero offset);
+ * without it a wrapped end address could pass the upper-bound comparison.
+ * IS_REG_OFFSET() dereferences nv->regs without a NULL check.
+ */
+static inline NvBool IS_REG_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    return ((offset >= nv->regs->cpu_address) &&
+            ((offset + (length - 1)) >= offset) &&
+            ((offset + (length - 1)) <= (nv->regs->cpu_address + (nv->regs->size - 1))));
+}

-#define IS_FB_OFFSET(nv, offset, length)                                        \
-    (((nv)->fb) && ((offset) >= (nv)->fb->cpu_address) &&                       \
-    (((offset) + ((length)-1)) <= (nv)->fb->cpu_address + ((nv)->fb->size-1)))
+/* nv->fb range; false when nv->fb is NULL. */
+static inline NvBool IS_FB_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    return ((nv->fb) && (offset >= nv->fb->cpu_address) &&
+            ((offset + (length - 1)) >= offset) &&
+            ((offset + (length - 1)) <= (nv->fb->cpu_address + (nv->fb->size - 1))));
+}

-#define IS_UD_OFFSET(nv, offset, length)                                        \
-    (((nv)->ud.cpu_address != 0) && ((nv)->ud.size != 0) &&                     \
-    ((offset) >= (nv)->ud.cpu_address) &&                                       \
-    (((offset) + ((length)-1)) <= (nv)->ud.cpu_address + ((nv)->ud.size-1)))
+/* nv->ud range; false when its cpu_address or size is zero. */
+static inline NvBool IS_UD_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    return ((nv->ud.cpu_address != 0) && (nv->ud.size != 0) &&
+            (offset >= nv->ud.cpu_address) &&
+            ((offset + (length - 1)) >= offset) &&
+            ((offset + (length - 1)) <= (nv->ud.cpu_address + (nv->ud.size - 1))));
+}

-#define IS_IMEM_OFFSET(nv, offset, length)                                      \
-    (((nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address != 0) &&                    \
-     ((nv)->bars[NV_GPU_BAR_INDEX_IMEM].size != 0) &&                           \
-     ((offset) >= (nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address) &&             \
-     (((offset) + ((length) - 1)) <=                                            \
-        (nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address +                         \
-            ((nv)->bars[NV_GPU_BAR_INDEX_IMEM].size - 1)))
+/* NV_GPU_BAR_INDEX_IMEM BAR range; false when its cpu_address or size is zero. */
+static inline NvBool IS_IMEM_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    return ((nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address != 0) &&
+            (nv->bars[NV_GPU_BAR_INDEX_IMEM].size != 0) &&
+            (offset >= nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address) &&
+            ((offset + (length - 1)) >= offset) &&
+            ((offset + (length - 1)) <= (nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address +
+                                         (nv->bars[NV_GPU_BAR_INDEX_IMEM].size - 1))));
+}

 #define NV_RM_MAX_MSIX_LINES  8

--- a/src/nvidia/arch/nvalloc/unix/src/osapi.c
+++ b/src/nvidia/arch/nvalloc/unix/src/osapi.c
@@ -780,10 +780,8 @@ static NV_STATUS RmAccessRegistry(
            RmStatus = NV_ERR_INVALID_STRING_LENGTH;
            goto done;
        }
-
        // get access to client's parmStr
        RMAPI_PARAM_COPY_INIT(parmStrParamCopy, tmpParmStr, clientParmStrAddress, ParmStrLength, 1);
-        parmStrParamCopy.flags |= RMAPI_PARAM_COPY_FLAGS_ZERO_BUFFER;
        RmStatus = rmapiParamsAcquire(&parmStrParamCopy, NV_TRUE);
        if (RmStatus != NV_OK)
        {
@@ -2026,6 +2024,7 @@ static NV_STATUS RmGetAllocPrivate(
    PMEMORY_DESCRIPTOR pMemDesc;
    NvU32 pageOffset;
    NvU64 pageCount;
+    NvU64 endingOffset;
    RsResourceRef *pResourceRef;
    RmResource *pRmResource;
    void *pMemData;
@@ -2086,8 +2085,9 @@ static NV_STATUS RmGetAllocPrivate(
    if (rmStatus != NV_OK)
        goto done;

-    pageCount = ((pageOffset + length) / os_page_size);
-    pageCount += (*pPageIndex + (((pageOffset + length) % os_page_size) ? 1 : 0));
+    endingOffset = pageOffset + length;
+    pageCount = (endingOffset / os_page_size);
+    pageCount += (*pPageIndex + ((endingOffset % os_page_size) ? 1 : 0));

    if (pageCount > NV_RM_PAGES_TO_OS_PAGES(pMemDesc->PageCount))
    {
--- a/src/nvidia/arch/nvalloc/unix/src/osinit.c
+++ b/src/nvidia/arch/nvalloc/unix/src/osinit.c
@@ -362,10 +362,6 @@ osHandleGpuLost
    pmc_boot_0 = NV_PRIV_REG_RD32(nv->regs->map_u, NV_PMC_BOOT_0);
    if (pmc_boot_0 != nvp->pmc_boot_0)
    {
-        RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
-        NV2080_CTRL_GPU_GET_OEM_BOARD_INFO_PARAMS *pBoardInfoParams;
-        NV_STATUS status;
-
        //
        // This doesn't support PEX Reset and Recovery yet.
        // This will help to prevent accessing registers of a GPU
@@ -376,24 +372,11 @@ osHandleGpuLost

        NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "GPU has fallen off the bus.\n");

-        pBoardInfoParams = portMemAllocNonPaged(sizeof(*pBoardInfoParams));
-        if (pBoardInfoParams != NULL)
-        {
-            portMemSet(pBoardInfoParams, 0, sizeof(*pBoardInfoParams));
-
-            status = pRmApi->Control(pRmApi, nv->rmapi.hClient,
-                                     nv->rmapi.hSubDevice,
-                                     NV2080_CTRL_CMD_GPU_GET_OEM_BOARD_INFO,
-                                     pBoardInfoParams,
-                                     sizeof(*pBoardInfoParams));
-            if (status == NV_OK)
+        if (pGpu->boardInfo != NULL && pGpu->boardInfo->serialNumber[0] != '\0')
        {
            NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
                          "GPU serial number is %s.\n",
-                              pBoardInfoParams->serialNumber);
-            }
-
-            portMemFree(pBoardInfoParams);
+                          pGpu->boardInfo->serialNumber);
        }

        gpuSetDisconnectedProperties(pGpu);
--- a/src/nvidia/generated/g_gpu_nvoc.h
+++ b/src/nvidia/generated/g_gpu_nvoc.h
@@ -60,6 +60,7 @@ typedef struct GPUATTACHARG GPUATTACHARG;
 * */
 #include "ctrl/ctrl0080/ctrl0080gpu.h" // NV0080_CTRL_GPU_GET_SRIOV_CAPS_PARAMS (form hal)
 #include "ctrl/ctrl2080/ctrl2080internal.h" // NV2080_CTRL_CMD_INTERNAL_MAX_BSPS/NVENCS
+#include "ctrl/ctrl2080/ctrl2080ecc.h"
 #include "ctrl/ctrl2080/ctrl2080nvd.h"
 #include "class/cl2080.h"
 #include "class/cl90cd.h"
--- a/src/nvidia/generated/g_kernel_gsp_nvoc.h
+++ b/src/nvidia/generated/g_kernel_gsp_nvoc.h
@@ -301,6 +301,7 @@ struct KernelGsp {
    LIBOS_LOG_DECODE logDecode;
    RM_LIBOS_LOG_MEM rmLibosLogMem[2];
    void *pLogElf;
+    NvBool bInInit;
    MEMORY_DESCRIPTOR *pMemDesc_simAccessBuf;
    SimAccessBuffer *pSimAccessBuf;
    NvP64 pSimAccessBufPriv;
--- a/src/nvidia/generated/g_nv_name_released.h
+++ b/src/nvidia/generated/g_nv_name_released.h
@@ -806,6 +806,8 @@ static const CHIPS_RELEASED sChipsReleased[] = {
    { 0x20B0, 0x1450, 0x10de, "NVIDIA A100-PG509-200" },
    { 0x20B2, 0x1463, 0x10de, "NVIDIA A100-SXM4-80GB" },
    { 0x20B2, 0x147f, 0x10de, "NVIDIA A100-SXM4-80GB" },
+    { 0x20B2, 0x1622, 0x10de, "NVIDIA A100-SXM4-80GB" },
+    { 0x20B2, 0x1623, 0x10de, "NVIDIA A100-SXM4-80GB" },
    { 0x20B3, 0x14a7, 0x10de, "NVIDIA PG506-242" },
    { 0x20B3, 0x14a8, 0x10de, "NVIDIA PG506-243" },
    { 0x20B5, 0x1533, 0x10de, "NVIDIA A100 80GB PCIe" },
@@ -907,6 +909,7 @@ static const CHIPS_RELEASED sChipsReleased[] = {
    { 0x2507, 0x0000, 0x0000, "NVIDIA GeForce RTX 3050" },
    { 0x2508, 0x0000, 0x0000, "NVIDIA GeForce RTX 3050 OEM" },
    { 0x2520, 0x0000, 0x0000, "NVIDIA GeForce RTX 3060 Laptop GPU" },
+    { 0x2521, 0x0000, 0x0000, "NVIDIA GeForce RTX 3060 Laptop GPU" },
    { 0x2523, 0x0000, 0x0000, "NVIDIA GeForce RTX 3050 Ti Laptop GPU" },
    { 0x2531, 0x151d, 0x1028, "NVIDIA RTX A2000" },
    { 0x2531, 0x151d, 0x103c, "NVIDIA RTX A2000" },
--- a/src/nvidia/inc/libraries/nvlog/nvlog.h
+++ b/src/nvidia/inc/libraries/nvlog/nvlog.h
@@ -86,8 +86,9 @@ NV_STATUS nvlogAllocBuffer(NvU32 size, NvU32 flags, NvU32 tag, NVLOG_BUFFER_HAND
 * @brief Deallocate a buffer with the given handle
 *
 * @param[in]   hBuffer     Handle of the buffer to deallocate
+ * @param[in]   bDeallocPreserved Deallocate preserved buffers
 */
-void nvlogDeallocBuffer(NVLOG_BUFFER_HANDLE hBuffer);
+void nvlogDeallocBuffer(NVLOG_BUFFER_HANDLE hBuffer, NvBool bDeallocPreserved);

 /**
 * @brief Write to a buffer with the given handle
--- a/src/nvidia/kernel/vgpu/nv/rpc.c
+++ b/src/nvidia/kernel/vgpu/nv/rpc.c
@@ -265,8 +265,11 @@ static NV_STATUS _issueRpcLarge
    // should not be called in broadcast mode
    NV_ASSERT_OR_RETURN(!gpumgrGetBcEnabledStatus(pGpu), NV_ERR_INVALID_STATE);

+    //
    // Copy the initial buffer
-    entryLength = NV_MIN(bufSize, pRpc->maxRpcSize);
+    // Temporary black magic WAR for bug 3594082: reducing the size by 1
+    //
+    entryLength = NV_MIN(bufSize, pRpc->maxRpcSize - 1);

    if ((NvU8 *)vgpu_rpc_message_header_v != pBuf8)
        portMemCopy(vgpu_rpc_message_header_v, entryLength, pBuf8, entryLength);
@@ -291,8 +294,11 @@ static NV_STATUS _issueRpcLarge
    remainingSize -= entryLength;
    pBuf8   += entryLength;

+    //
    // Copy the remaining buffers
-    entryLength = pRpc->maxRpcSize - sizeof(rpc_message_header_v);
+    // Temporary black magic WAR for bug 3594082: reducing the size by 1
+    //
+    entryLength = pRpc->maxRpcSize - sizeof(rpc_message_header_v) - 1;
    while (remainingSize != 0)
    {
        if (entryLength > remainingSize)
--- a/src/nvidia/src/kernel/diagnostics/nvlog.c
+++ b/src/nvidia/src/kernel/diagnostics/nvlog.c
@@ -103,7 +103,7 @@ nvlogDestroy()
    tlsShutdown();
    for (i = 0; i < NVLOG_MAX_BUFFERS; i++)
    {
-        nvlogDeallocBuffer(i);
+        nvlogDeallocBuffer(i, NV_TRUE);
    }
    if (NvLogLogger.mainLock != NULL)
    {
@@ -261,7 +261,8 @@ nvlogAllocBuffer
 void
 nvlogDeallocBuffer
 (
-    NVLOG_BUFFER_HANDLE hBuffer
+    NVLOG_BUFFER_HANDLE hBuffer,
+    NvBool bDeallocPreserved
 )
 {
    NVLOG_BUFFER *pBuffer;
@@ -271,6 +272,12 @@ nvlogDeallocBuffer

    pBuffer = NvLogLogger.pBuffers[hBuffer];

+    if (FLD_TEST_DRF(LOG_BUFFER, _FLAGS, _PRESERVE, _YES, pBuffer->flags) &&
+        !bDeallocPreserved)
+    {
+        return;
+    }
+
    pBuffer->flags = FLD_SET_DRF(LOG_BUFFER, _FLAGS, _DISABLED,
                                 _YES, pBuffer->flags);

--- a/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c
+++ b/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c
@@ -2502,17 +2502,21 @@ kbusFlushSingle_GM107
            if (IS_GSP_CLIENT(pGpu))
            {
                //
-                // on GSP client, we only support PCIE_READ to do flush
-                // a sysmembar flush should call kbusSendSysmembarSingle_HAL explicitly
+                // on GSP client, we should use PCIE_READ to do video memory flush.
+                // A sysmembar flush that touches registers is done through RPC and has
+                // lower efficiency.  For cases where it needs sysmembar, the caller site
+                // should use kbusSendSysmembarSingle_HAL explicitly.
                //
-                NV_ASSERT_OR_RETURN(0, NV_ERR_INVALID_PATH);
+                NV_ASSERT(0);
+
+                // This will dump a stack trace to assist debug on certain
+                // platforms.
+                osAssertFailed();
            }
-            else
-            {
+
            return kbusSendSysmembarSingle_HAL(pGpu, pKernelBus);
        }
    }
-    }

    return NV_OK;
 }
--- a/src/nvidia/src/kernel/gpu/fifo/kernel_channel.c
+++ b/src/nvidia/src/kernel/gpu/fifo/kernel_channel.c
@@ -3750,6 +3750,7 @@ kchannelUpdateWorkSubmitTokenNotifIndex_IMPL
    Memory *pMemory;
    ContextDma *pContextDma;
    NvU32 addressSpace;
+    NvU64 notificationBufferSize;
    NV_STATUS status;

    hNotifier = pKernelChannel->hErrorContext;
@@ -3758,6 +3759,8 @@ kchannelUpdateWorkSubmitTokenNotifIndex_IMPL
    NV_CHECK_OR_RETURN(LEVEL_INFO, index != NV_CHANNELGPFIFO_NOTIFICATION_TYPE_ERROR,
                     NV_ERR_INVALID_ARGUMENT);

+    notificationBufferSize = (index + 1) * sizeof(NvNotification);
+
    status = deviceGetByInstance(pClient, gpuGetDeviceInstance(pGpu), &pDevice);
    if (status != NV_OK)
        return NV_ERR_INVALID_DEVICE;
@@ -3766,7 +3769,7 @@ kchannelUpdateWorkSubmitTokenNotifIndex_IMPL
    {
        addressSpace = memdescGetAddressSpace(pMemory->pMemDesc);

-        NV_CHECK_OR_RETURN(LEVEL_INFO, pMemory->Length >= ((index + 1) * sizeof(NvNotification)),
+        NV_CHECK_OR_RETURN(LEVEL_INFO, pMemory->Length >= notificationBufferSize,
                         NV_ERR_OUT_OF_RANGE);
        switch (addressSpace)
        {
@@ -3784,7 +3787,7 @@ kchannelUpdateWorkSubmitTokenNotifIndex_IMPL
                                         &pDmaMappingInfo),
                    NV_ERR_GENERIC);

-                NV_CHECK_OR_RETURN(LEVEL_INFO, pDmaMappingInfo->pMemDesc->Size >= ((index + 1) * sizeof(NvNotification)),
+                NV_CHECK_OR_RETURN(LEVEL_INFO, pDmaMappingInfo->pMemDesc->Size >= notificationBufferSize,
                                 NV_ERR_OUT_OF_RANGE);
                break;
            }
@@ -3799,7 +3802,7 @@ kchannelUpdateWorkSubmitTokenNotifIndex_IMPL
    }
    else if (NV_OK == ctxdmaGetByHandle(pClient, hNotifier, &pContextDma))
    {
-        NV_CHECK_OR_RETURN(LEVEL_INFO, pContextDma->Limit >= (((index + 1) * sizeof(NvNotification)) - 1),
+        NV_CHECK_OR_RETURN(LEVEL_INFO, pContextDma->Limit >= (notificationBufferSize - 1),
                         NV_ERR_OUT_OF_RANGE);
    }
    else
--- a/src/nvidia/src/kernel/gpu/gpu.c
+++ b/src/nvidia/src/kernel/gpu/gpu.c
@@ -1923,26 +1923,6 @@ gpuStatePreInit_IMPL
        }
    }

-    pGpu->boardInfo = portMemAllocNonPaged(sizeof(*pGpu->boardInfo));
-    if (pGpu->boardInfo)
-    {
-        // To avoid potential race of xid reporting with the control, zero it out
-        portMemSet(pGpu->boardInfo, '\0', sizeof(*pGpu->boardInfo));
-
-        RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
-
-        if (pRmApi->Control(pRmApi,
-                        pGpu->hInternalClient,
-                        pGpu->hInternalSubdevice,
-                        NV2080_CTRL_CMD_GPU_GET_OEM_BOARD_INFO,
-                        pGpu->boardInfo,
-                        sizeof(*pGpu->boardInfo)) != NV_OK)
-        {
-            portMemFree(pGpu->boardInfo);
-            pGpu->boardInfo = NULL;
-        }
-    }
-
    return rmStatus;
 }

@@ -2291,6 +2271,26 @@ gpuStatePostLoad
            goto gpuStatePostLoad_exit;
    }

+    pGpu->boardInfo = portMemAllocNonPaged(sizeof(*pGpu->boardInfo));
+    if (pGpu->boardInfo)
+    {
+        // To avoid potential race of xid reporting with the control, zero it out
+        portMemSet(pGpu->boardInfo, '\0', sizeof(*pGpu->boardInfo));
+
+        RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
+
+        if(pRmApi->Control(pRmApi,
+                           pGpu->hInternalClient,
+                           pGpu->hInternalSubdevice,
+                           NV2080_CTRL_CMD_GPU_GET_OEM_BOARD_INFO,
+                           pGpu->boardInfo,
+                           sizeof(*pGpu->boardInfo)) != NV_OK)
+        {
+            portMemFree(pGpu->boardInfo);
+            pGpu->boardInfo = NULL;
+        }
+    }
+
 gpuStatePostLoad_exit:
    return rmStatus;
 }
@@ -2326,6 +2326,9 @@ gpuStatePreUnload
    NvU32               curEngDescIdx;
    NV_STATUS           rmStatus = NV_OK;

+    portMemFree(pGpu->boardInfo);
+    pGpu->boardInfo = NULL;
+
    engDescriptorList = gpuGetUnloadEngineDescriptors(pGpu);
    numEngDescriptors = gpuGetNumEngDescriptors(pGpu);

@@ -2648,9 +2651,6 @@ gpuStateDestroy_IMPL
    _gpuFreeInternalObjects(pGpu);
    gpuDestroyGenericKernelFalconList(pGpu);

-    portMemFree(pGpu->boardInfo);
-    pGpu->boardInfo = NULL;
-
    portMemFree(pGpu->gspSupportedEngines);
    pGpu->gspSupportedEngines = NULL;

--- a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c
+++ b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c
@@ -1047,7 +1047,7 @@ _kgspInitLibosLoggingStructures

        //
        // Setup logging memory for each task.
-        // Use MEMDESC_FLAGS_CPU_ONLY -- to early to call memdescMapIommu.
+        // Use MEMDESC_FLAGS_CPU_ONLY -- too early to call memdescMapIommu.
        //
        NV_ASSERT_OK_OR_GOTO(nvStatus,
            memdescCreate(&pLog->pTaskLogDescriptor,
@@ -1258,6 +1258,8 @@ kgspInitRm_IMPL
        return NV_ERR_INVALID_ARGUMENT;
    }

+    pKernelGsp->bInInit = NV_TRUE;
+
    // Need to hold the GPU instance lock in order to write to the RPC queue
    NV_ASSERT_OK_OR_GOTO(status,
        rmGpuGroupLockAcquire(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE,
@@ -1403,6 +1405,14 @@ kgspInitRm_IMPL
    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, kgspStartLogPolling(pGpu, pKernelGsp), done);

 done:
+    pKernelGsp->bInInit = NV_FALSE;
+
+    if (status != NV_OK)
+    {
+        // Preserve any captured gsp-rm logs
+        libosPreserveLogs(&pKernelGsp->logDecode);
+    }
+
    if (gpusLockedMask != 0)
    {
        rmGpuGroupLockRelease(gpusLockedMask, GPUS_LOCK_FLAGS_NONE);
@@ -1520,7 +1530,7 @@ kgspDumpGspLogs_IMPL
    NvBool bSyncNvLog
 )
 {
-    if (pKernelGsp->pLogElf || bSyncNvLog)
+    if (pKernelGsp->bInInit || pKernelGsp->pLogElf || bSyncNvLog)
        libosExtractLogs(&pKernelGsp->logDecode, bSyncNvLog);
 }

--- a/src/nvidia/src/kernel/gpu/mem_mgr/mem_mgr.c
+++ b/src/nvidia/src/kernel/gpu/mem_mgr/mem_mgr.c
@@ -1959,6 +1959,7 @@ memmgrFillComprInfo_IMPL
 {
    const MEMORY_SYSTEM_STATIC_CONFIG *pMemorySystemConfig =
        kmemsysGetStaticConfig(pGpu, GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu));
+    NvU32 size;

    portMemSet(pComprInfo, 0, sizeof(*pComprInfo));

@@ -1969,10 +1970,12 @@ memmgrFillComprInfo_IMPL

    NV_ASSERT(compTagStartOffset != ~(NvU32)0);

+    size = pageSize * pageCount;
+
    pComprInfo->compPageShift = pMemorySystemConfig->comprPageShift;
    pComprInfo->compTagLineMin = compTagStartOffset;
    pComprInfo->compPageIndexLo = (NvU32)(surfOffset >> pComprInfo->compPageShift);
-    pComprInfo->compPageIndexHi = (NvU32)((surfOffset + pageSize * pageCount - 1) >> pComprInfo->compPageShift);
+    pComprInfo->compPageIndexHi = (NvU32)((surfOffset + size - 1) >> pComprInfo->compPageShift);
    pComprInfo->compTagLineMultiplier = 1;

    return NV_OK;
--- a/src/nvidia/src/kernel/gpu_mgr/gpu_mgr.c
+++ b/src/nvidia/src/kernel/gpu_mgr/gpu_mgr.c
@@ -751,6 +751,8 @@ NvBool gpumgrIsDeviceRmFirmwareCapable
        0x2236, // A10   SKU215     Pris-24
        0x2237, // A10G  SKU215     Pris-24
        0x25B6, // A16
+        0x20F5, // A800-80
+        0x20F6, // A800-40
    };
    NvU32 count = NV_ARRAY_ELEMENTS(defaultGspRmGpus);
    NvU32 i;