535.43.24

russellcnv
2024-01-31 14:02:06 -08:00
parent 2a3b58b8c8
commit e558660fc2
267 changed files with 89045 additions and 82824 deletions

View File

@@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
EXTRA_CFLAGS += -I$(src)
EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.43.23\"
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.43.24\"
ifneq ($(SYSSRCHOST1X),)
EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
@@ -123,6 +123,9 @@ ifneq ($(wildcard /proc/sgi_uv),)
EXTRA_CFLAGS += -DNV_CONFIG_X86_UV
endif
ifdef VGX_FORCE_VFIO_PCI_CORE
EXTRA_CFLAGS += -DNV_VGPU_FORCE_VFIO_PCI_CORE
endif
#
# The conftest.sh script tests various aspects of the target kernel.

View File

@@ -2067,4 +2067,7 @@ typedef enum
#include <linux/clk-provider.h>
#endif
#define NV_EXPORT_SYMBOL(symbol) EXPORT_SYMBOL_GPL(symbol)
#define NV_CHECK_EXPORT_SYMBOL(symbol) NV_IS_EXPORT_SYMBOL_PRESENT_##symbol
#endif /* _NV_LINUX_H_ */
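A minimal, self-contained sketch (the symbol name is illustrative, not taken from this change) of how the new NV_CHECK_EXPORT_SYMBOL macro composes with the conftest output shown later in this commit: conftest.sh emits NV_IS_EXPORT_SYMBOL_PRESENT_<symbol> definitions, and the ## paste turns a symbol name into that generated macro so presence can be tested in preprocessor conditionals.

/* Hypothetical conftest output for a symbol found in Module.symvers. */
#define NV_IS_EXPORT_SYMBOL_PRESENT_migrate_vma_setup 1

/* Macro as added above. */
#define NV_CHECK_EXPORT_SYMBOL(symbol) NV_IS_EXPORT_SYMBOL_PRESENT_##symbol

/* Expands to NV_IS_EXPORT_SYMBOL_PRESENT_migrate_vma_setup, i.e. 1. */
#if NV_CHECK_EXPORT_SYMBOL(migrate_vma_setup)
int have_migrate_vma_setup = 1;
#else
int have_migrate_vma_setup = 0;
#endif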

View File

@@ -924,6 +924,7 @@ NV_STATUS NV_API_CALL rm_ioctl (nvidia_stack_t *, nv_state_t *
NvBool NV_API_CALL rm_isr (nvidia_stack_t *, nv_state_t *, NvU32 *);
void NV_API_CALL rm_isr_bh (nvidia_stack_t *, nv_state_t *);
void NV_API_CALL rm_isr_bh_unlocked (nvidia_stack_t *, nv_state_t *);
NvBool NV_API_CALL rm_is_msix_allowed (nvidia_stack_t *, nv_state_t *);
NV_STATUS NV_API_CALL rm_power_management (nvidia_stack_t *, nv_state_t *, nv_pm_action_t);
NV_STATUS NV_API_CALL rm_stop_user_channels (nvidia_stack_t *, nv_state_t *);
NV_STATUS NV_API_CALL rm_restart_user_channels (nvidia_stack_t *, nv_state_t *);

View File

@@ -207,9 +207,13 @@ enum os_pci_req_atomics_type {
OS_INTF_PCIE_REQ_ATOMICS_128BIT
};
NV_STATUS NV_API_CALL os_enable_pci_req_atomics (void *, enum os_pci_req_atomics_type);
NV_STATUS NV_API_CALL os_get_numa_node_memory_usage (NvS32, NvU64 *, NvU64 *);
NV_STATUS NV_API_CALL os_numa_add_gpu_memory (void *, NvU64, NvU64, NvU32 *);
NV_STATUS NV_API_CALL os_numa_remove_gpu_memory (void *, NvU64, NvU64, NvU32);
NV_STATUS NV_API_CALL os_offline_page_at_address(NvU64 address);
void* NV_API_CALL os_get_pid_info(void);
void NV_API_CALL os_put_pid_info(void *pid_info);
NV_STATUS NV_API_CALL os_find_ns_pid(void *pid_info, NvU32 *ns_pid);
extern NvU32 os_page_size;
extern NvU64 os_page_mask;

View File

@@ -316,7 +316,7 @@ export_symbol_present_conftest() {
SYMBOL="$1"
TAB=' '
if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_SYMBOL.*\$" \
if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_SYMBOL\(_GPL\)\?\s*\$" \
"$OUTPUT/Module.symvers" >/dev/null 2>&1; then
echo "#define NV_IS_EXPORT_SYMBOL_PRESENT_$SYMBOL 1" |
append_conftest "symbols"
@@ -337,7 +337,7 @@ export_symbol_gpl_conftest() {
SYMBOL="$1"
TAB=' '
if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_\(UNUSED_\)*SYMBOL_GPL\$" \
if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_\(UNUSED_\)*SYMBOL_GPL\s*\$" \
"$OUTPUT/Module.symvers" >/dev/null 2>&1; then
echo "#define NV_IS_EXPORT_SYMBOL_GPL_$SYMBOL 1" |
append_conftest "symbols"
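# Illustrative check of the relaxed patterns above (the symbol and paths are
# made up; assumes GNU grep, which supports \? and \s in basic regular
# expressions, as the new patterns already do). Newer kernels append a
# namespace column after the export type in Module.symvers, so entries can
# carry trailing whitespace that the old, strictly anchored patterns rejected:
TAB="$(printf '\t')"
printf '0x12345678\tfoo_symbol\tdrivers/foo/foo\tEXPORT_SYMBOL_GPL\t\n' > /tmp/Module.symvers
if grep -e "${TAB}foo_symbol${TAB}.*${TAB}EXPORT_SYMBOL\(_GPL\)\?\s*\$" \
    /tmp/Module.symvers >/dev/null 2>&1; then
    echo "foo_symbol is exported (EXPORT_SYMBOL or EXPORT_SYMBOL_GPL)"
fi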
@@ -4468,6 +4468,24 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE" "" "types"
;;
mmu_notifier_ops_arch_invalidate_secondary_tlbs)
#
# Determine if the mmu_notifier_ops struct has the
# 'arch_invalidate_secondary_tlbs' member.
#
# struct mmu_notifier_ops.invalidate_range was renamed to
# arch_invalidate_secondary_tlbs by commit 1af5a8109904
# ("mmu_notifiers: rename invalidate_range notifier") due to be
# added in v6.6
CODE="
#include <linux/mmu_notifier.h>
int conftest_mmu_notifier_ops_arch_invalidate_secondary_tlbs(void) {
return offsetof(struct mmu_notifier_ops, arch_invalidate_secondary_tlbs);
}"
compile_check_conftest "$CODE" "NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS" "" "types"
;;
drm_format_num_planes)
#
# Determine if drm_format_num_planes() function is present.
@@ -5636,23 +5654,6 @@ compile_test() {
compile_check_conftest "$CODE" "NV_GPIO_TO_IRQ_PRESENT" "" "functions"
;;
migrate_vma_setup)
#
# Determine if migrate_vma_setup() function is present
#
# migrate_vma_setup() function was added by commit
# a7d1f22bb74f32cf3cd93f52776007e161f1a738 ("mm: turn migrate_vma
# upside down) in v5.4.
# (2019-08-20).
CODE="
#include <linux/migrate.h>
int conftest_migrate_vma_setup(void) {
migrate_vma_setup();
}"
compile_check_conftest "$CODE" "NV_MIGRATE_VMA_SETUP_PRESENT" "" "functions"
;;
migrate_vma_added_flags)
#
# Determine if migrate_vma structure has flags
@@ -5743,23 +5744,25 @@ compile_test() {
compile_check_conftest "$CODE" "NV_IOASID_GET_PRESENT" "" "functions"
;;
mm_pasid_set)
mm_pasid_drop)
#
# Determine if mm_pasid_set() function is present
# Determine if mm_pasid_drop() function is present
#
# Added by commit 701fac40384f ("iommu/sva: Assign a PASID to mm
# on PASID allocation and free it on mm exit") in v5.18.
# Moved to linux/iommu.h in commit cd3891158a77 ("iommu/sva: Move
# PASID helpers to sva code") in v6.4.
#
# mm_pasid_set() function was added by commit
# 701fac40384f07197b106136012804c3cae0b3de (iommu/sva: Assign a
# PASID to mm on PASID allocation and free it on mm exit) in v5.18.
# (2022-02-15).
CODE="
#if defined(NV_LINUX_SCHED_MM_H_PRESENT)
#include <linux/sched/mm.h>
#endif
void conftest_mm_pasid_set(void) {
mm_pasid_set();
#include <linux/iommu.h>
void conftest_mm_pasid_drop(void) {
mm_pasid_drop();
}"
compile_check_conftest "$CODE" "NV_MM_PASID_SET_PRESENT" "" "functions"
compile_check_conftest "$CODE" "NV_MM_PASID_DROP_PRESENT" "" "functions"
;;
drm_crtc_state_has_no_vblank)
@@ -6279,6 +6282,21 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MEMORY_FAILURE_MF_SW_SIMULATED_DEFINED" "" "types"
;;
crypto_tfm_ctx_aligned)
# Determine if 'crypto_tfm_ctx_aligned' is defined.
#
# Removed by commit 25c74a39e0f6 ("crypto: hmac - remove unnecessary
# alignment logic") in v6.7.
#
CODE="
#include <crypto/algapi.h>
void conftest_crypto_tfm_ctx_aligned(void) {
(void)crypto_tfm_ctx_aligned();
}"
compile_check_conftest "$CODE" "NV_CRYPTO_TFM_CTX_ALIGNED_PRESENT" "" "functions"
;;
crypto)
#
# Determine if we support various crypto functions.
@@ -6341,6 +6359,22 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MEMPOLICY_HAS_HOME_NODE" "" "types"
;;
mpol_preferred_many_present)
#
# Determine if MPOL_PREFERRED_MANY enum is present or not
#
# Added by commit b27abaccf8e8b ("mm/mempolicy: add
# MPOL_PREFERRED_MANY for multiple preferred nodes") in
# v5.15
#
CODE="
#include <linux/mempolicy.h>
int mpol_preferred_many = MPOL_PREFERRED_MANY;
"
compile_check_conftest "$CODE" "NV_MPOL_PREFERRED_MANY_PRESENT" "" "types"
;;
mmu_interval_notifier)
#
# Determine if mmu_interval_notifier struct is present or not
@@ -6356,6 +6390,21 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MMU_INTERVAL_NOTIFIER" "" "types"
;;
drm_unlocked_ioctl_flag_present)
# Determine if DRM_UNLOCKED IOCTL flag is present.
#
# DRM_UNLOCKED was removed by commit 2798ffcc1d6a ("drm: Remove
# locking for legacy ioctls and DRM_UNLOCKED") in Linux
# next-20231208.
CODE="
#if defined(NV_DRM_DRM_IOCTL_H_PRESENT)
#include <drm/drm_ioctl.h>
#endif
int flags = DRM_UNLOCKED;"
compile_check_conftest "$CODE" "NV_DRM_UNLOCKED_IOCTL_FLAG_PRESENT" "" "types"
;;
# When adding a new conftest entry, please use the correct format for
# specifying the relevant upstream Linux kernel commit.
#
@@ -6680,18 +6729,9 @@ case "$5" in
VFIO_PCI_CORE_PRESENT=1
fi
# When this sanity check is run via nvidia-installer, it sets ARCH as aarch64.
# But, when it is run via Kbuild, ARCH is set as arm64
if [ "$ARCH" = "aarch64" ]; then
ARCH="arm64"
fi
if [ "$VFIO_IOMMU_PRESENT" != "0" ] && [ "$KVM_PRESENT" != "0" ] ; then
# On x86_64, vGPU requires MDEV framework to be present.
# On aarch64, vGPU requires MDEV or vfio-pci-core framework to be present.
if ([ "$ARCH" = "arm64" ] && ([ "$VFIO_MDEV_PRESENT" != "0" ] || [ "$VFIO_PCI_CORE_PRESENT" != "0" ])) ||
([ "$ARCH" = "x86_64" ] && [ "$VFIO_MDEV_PRESENT" != "0" ];) then
# vGPU requires either MDEV or vfio-pci-core framework to be present.
if [ "$VFIO_MDEV_PRESENT" != "0" ] || [ "$VFIO_PCI_CORE_PRESENT" != "0" ]; then
exit 0
fi
fi
@@ -6702,14 +6742,10 @@ case "$5" in
echo "CONFIG_VFIO_IOMMU_TYPE1";
fi
if [ "$ARCH" = "arm64" ] && [ "$VFIO_MDEV_PRESENT" = "0" ] && [ "$VFIO_PCI_CORE_PRESENT" = "0" ]; then
if [ "$VFIO_MDEV_PRESENT" = "0" ] && [ "$VFIO_PCI_CORE_PRESENT" = "0" ]; then
echo "either CONFIG_VFIO_MDEV or CONFIG_VFIO_PCI_CORE";
fi
if [ "$ARCH" = "x86_64" ] && [ "$VFIO_MDEV_PRESENT" = "0" ]; then
echo "CONFIG_VFIO_MDEV";
fi
if [ "$KVM_PRESENT" = "0" ]; then
echo "CONFIG_KVM";
fi

View File

@@ -1312,9 +1312,21 @@ static const struct drm_ioctl_desc nv_drm_ioctls[] = {
DRM_RENDER_ALLOW|DRM_UNLOCKED),
#endif
/*
* DRM_UNLOCKED is implicit for all non-legacy DRM driver IOCTLs since Linux
* v4.10 commit fa5386459f06 "drm: Used DRM_LEGACY for all legacy functions"
* (Linux v4.4 commit ea487835e887 "drm: Enforce unlocked ioctl operation
* for kms driver ioctls" previously did it only for drivers that set the
* DRM_MODESET flag), so this will race with SET_CLIENT_CAP. Linux v4.11
* commit dcf727ab5d17 "drm: setclientcap doesn't need the drm BKL" also
* removed locking from SET_CLIENT_CAP so there is no use attempting to lock
* manually. The latter commit acknowledges that this can expose userspace
* to inconsistent behavior when racing with itself, but accepts that risk.
*/
DRM_IOCTL_DEF_DRV(NVIDIA_GET_CLIENT_CAPABILITY,
nv_drm_get_client_capability_ioctl,
0),
#if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE)
DRM_IOCTL_DEF_DRV(NVIDIA_GET_CRTC_CRC32,
nv_drm_get_crtc_crc32_ioctl,

View File

@@ -243,6 +243,15 @@ static int __nv_drm_nvkms_gem_obj_init(
NvU64 *pages = NULL;
NvU32 numPages = 0;
if ((size % PAGE_SIZE) != 0) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"NvKmsKapiMemory 0x%p size should be in a multiple of page size to "
"create a gem object",
pMemory);
return -EINVAL;
}
nv_nvkms_memory->pPhysicalAddress = NULL;
nv_nvkms_memory->pWriteCombinedIORemapAddress = NULL;
nv_nvkms_memory->physically_mapped = false;

View File

@@ -582,6 +582,19 @@ static inline int nv_drm_format_num_planes(uint32_t format)
#endif /* defined(NV_DRM_FORMAT_MODIFIERS_PRESENT) */
/*
* DRM_UNLOCKED was removed with linux-next commit 2798ffcc1d6a ("drm: Remove
* locking for legacy ioctls and DRM_UNLOCKED"), but it was previously made
* implicit for all non-legacy DRM driver IOCTLs since Linux v4.10 commit
* fa5386459f06 "drm: Used DRM_LEGACY for all legacy functions" (Linux v4.4
* commit ea487835e887 "drm: Enforce unlocked ioctl operation for kms driver
* ioctls" previously did it only for drivers that set the DRM_MODESET flag), so
* it was effectively a no-op anyway.
*/
#if !defined(NV_DRM_UNLOCKED_IOCTL_FLAG_PRESENT)
#define DRM_UNLOCKED 0
#endif
/*
* drm_vma_offset_exact_lookup_locked() were added
* by kernel commit 2225cfe46bcc which was Signed-off-by:

View File

@@ -133,3 +133,4 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += drm_connector_lookup
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_connector_put
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_area_struct_has_const_vm_flags
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_driver_has_dumb_destroy
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_unlocked_ioctl_flag_present

View File

@@ -68,6 +68,9 @@ module_param_named(output_rounding_fix, output_rounding_fix, bool, 0400);
static bool disable_vrr_memclk_switch = false;
module_param_named(disable_vrr_memclk_switch, disable_vrr_memclk_switch, bool, 0400);
static bool opportunistic_display_sync = true;
module_param_named(opportunistic_display_sync, opportunistic_display_sync, bool, 0400);
/* These parameters are used for fault injection tests. Normally the defaults
* should be used. */
MODULE_PARM_DESC(fail_malloc, "Fail the Nth call to nvkms_alloc");
@@ -99,6 +102,11 @@ NvBool nvkms_disable_vrr_memclk_switch(void)
return disable_vrr_memclk_switch;
}
NvBool nvkms_opportunistic_display_sync(void)
{
return opportunistic_display_sync;
}
#define NVKMS_SYNCPT_STUBS_NEEDED
/*************************************************************************
@@ -200,9 +208,23 @@ static inline int nvkms_read_trylock_pm_lock(void)
static inline void nvkms_read_lock_pm_lock(void)
{
while (!down_read_trylock(&nvkms_pm_lock)) {
try_to_freeze();
cond_resched();
if ((current->flags & PF_NOFREEZE)) {
/*
* Non-freezable tasks (i.e. kthreads in this case) don't have to worry
* about being frozen during system suspend, but do need to block so
* that the CPU can go idle during s2idle. Do a normal uninterruptible
* blocking wait for the PM lock.
*/
down_read(&nvkms_pm_lock);
} else {
/*
* For freezable tasks, make sure we give the kernel an opportunity to
* freeze if taking the PM lock fails.
*/
while (!down_read_trylock(&nvkms_pm_lock)) {
try_to_freeze();
cond_resched();
}
}
}

View File

@@ -99,6 +99,7 @@ typedef struct {
NvBool nvkms_output_rounding_fix(void);
NvBool nvkms_disable_vrr_memclk_switch(void);
NvBool nvkms_opportunistic_display_sync(void);
void nvkms_call_rm (void *ops);
void* nvkms_alloc (size_t size,

View File

@@ -1,20 +1,25 @@
/* SPDX-License-Identifier: Linux-OpenIB */
/*
* Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
@@ -43,7 +48,9 @@
MODULE_AUTHOR("Yishai Hadas");
MODULE_DESCRIPTION("NVIDIA GPU memory plug-in");
MODULE_LICENSE("Linux-OpenIB");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION(DRV_VERSION);
enum {
NV_MEM_PEERDIRECT_SUPPORT_DEFAULT = 0,
@@ -53,7 +60,13 @@ static int peerdirect_support = NV_MEM_PEERDIRECT_SUPPORT_DEFAULT;
module_param(peerdirect_support, int, S_IRUGO);
MODULE_PARM_DESC(peerdirect_support, "Set level of support for Peer-direct, 0 [default] or 1 [legacy, for example MLNX_OFED 4.9 LTS]");
#define peer_err(FMT, ARGS...) printk(KERN_ERR "nvidia-peermem" " %s:%d " FMT, __FUNCTION__, __LINE__, ## ARGS)
#define peer_err(FMT, ARGS...) printk(KERN_ERR "nvidia-peermem" " %s:%d ERROR " FMT, __FUNCTION__, __LINE__, ## ARGS)
#ifdef NV_MEM_DEBUG
#define peer_trace(FMT, ARGS...) printk(KERN_DEBUG "nvidia-peermem" " %s:%d TRACE " FMT, __FUNCTION__, __LINE__, ## ARGS)
#else
#define peer_trace(FMT, ARGS...) do {} while (0)
#endif
#if defined(NV_MLNX_IB_PEER_MEM_SYMBOLS_PRESENT)
@@ -74,7 +87,10 @@ invalidate_peer_memory mem_invalidate_callback;
static void *reg_handle = NULL;
static void *reg_handle_nc = NULL;
#define NV_MEM_CONTEXT_MAGIC ((u64)0xF1F4F1D0FEF0DAD0ULL)
struct nv_mem_context {
u64 pad1;
struct nvidia_p2p_page_table *page_table;
struct nvidia_p2p_dma_mapping *dma_mapping;
u64 core_context;
@@ -86,8 +102,22 @@ struct nv_mem_context {
struct task_struct *callback_task;
int sg_allocated;
struct sg_table sg_head;
u64 pad2;
};
#define NV_MEM_CONTEXT_CHECK_OK(MC) ({ \
struct nv_mem_context *mc = (MC); \
int rc = ((0 != mc) && \
(READ_ONCE(mc->pad1) == NV_MEM_CONTEXT_MAGIC) && \
(READ_ONCE(mc->pad2) == NV_MEM_CONTEXT_MAGIC)); \
if (!rc) { \
peer_trace("invalid nv_mem_context=%px pad1=%016llx pad2=%016llx\n", \
mc, \
mc?mc->pad1:0, \
mc?mc->pad2:0); \
} \
rc; \
})
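/*
 * pad1/pad2 are stamped with NV_MEM_CONTEXT_MAGIC in nv_mem_acquire() (see
 * below) and the whole structure is zeroed before kfree(), so a stale or
 * corrupted context handed to nv_get_p2p_free_callback() fails this check
 * instead of being dereferenced.
 */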
static void nv_get_p2p_free_callback(void *data)
{
@@ -97,8 +127,9 @@ static void nv_get_p2p_free_callback(void *data)
struct nvidia_p2p_dma_mapping *dma_mapping = NULL;
__module_get(THIS_MODULE);
if (!nv_mem_context) {
peer_err("nv_get_p2p_free_callback -- invalid nv_mem_context\n");
if (!NV_MEM_CONTEXT_CHECK_OK(nv_mem_context)) {
peer_err("detected invalid context, skipping further processing\n");
goto out;
}
@@ -169,9 +200,11 @@ static int nv_mem_acquire(unsigned long addr, size_t size, void *peer_mem_privat
/* Error case handled as not mine */
return 0;
nv_mem_context->pad1 = NV_MEM_CONTEXT_MAGIC;
nv_mem_context->page_virt_start = addr & GPU_PAGE_MASK;
nv_mem_context->page_virt_end = (addr + size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;
nv_mem_context->mapped_size = nv_mem_context->page_virt_end - nv_mem_context->page_virt_start;
nv_mem_context->pad2 = NV_MEM_CONTEXT_MAGIC;
ret = nvidia_p2p_get_pages(0, 0, nv_mem_context->page_virt_start, nv_mem_context->mapped_size,
&nv_mem_context->page_table, nv_mem_dummy_callback, nv_mem_context);
@@ -195,6 +228,7 @@ static int nv_mem_acquire(unsigned long addr, size_t size, void *peer_mem_privat
return 1;
err:
memset(nv_mem_context, 0, sizeof(*nv_mem_context));
kfree(nv_mem_context);
/* Error case handled as not mine */
@@ -342,6 +376,7 @@ static void nv_mem_release(void *context)
sg_free_table(&nv_mem_context->sg_head);
nv_mem_context->sg_allocated = 0;
}
memset(nv_mem_context, 0, sizeof(*nv_mem_context));
kfree(nv_mem_context);
module_put(THIS_MODULE);
return;

View File

@@ -81,8 +81,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_memory_uc
NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_pages_uc
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ioasid_get
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_set
NV_CONFTEST_FUNCTION_COMPILE_TESTS += migrate_vma_setup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_drop
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmget_not_zero
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmgrab
NV_CONFTEST_FUNCTION_COMPILE_TESTS += iommu_sva_bind_device_has_drvdata_arg
@@ -100,6 +99,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += kmem_cache_has_kobj_remove_work
NV_CONFTEST_TYPE_COMPILE_TESTS += sysfs_slab_unlink
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_fault_t
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_invalidate_range
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_arch_invalidate_secondary_tlbs
NV_CONFTEST_TYPE_COMPILE_TESTS += proc_ops
NV_CONFTEST_TYPE_COMPILE_TESTS += timespec64
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock
@@ -110,6 +110,8 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += handle_mm_fault_has_mm_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += handle_mm_fault_has_pt_regs_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += mempolicy_has_unified_nodes
NV_CONFTEST_TYPE_COMPILE_TESTS += mempolicy_has_home_node
NV_CONFTEST_TYPE_COMPILE_TESTS += mpol_preferred_many_present
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_interval_notifier
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_int_active_memcg
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_migrate_vma_setup

View File

@@ -571,7 +571,6 @@ static void uvm_vm_open_managed_entry(struct vm_area_struct *vma)
static void uvm_vm_close_managed(struct vm_area_struct *vma)
{
uvm_va_space_t *va_space = uvm_va_space_get(vma->vm_file);
uvm_processor_id_t gpu_id;
bool make_zombie = false;
if (current->mm != NULL)
@@ -606,12 +605,6 @@ static void uvm_vm_close_managed(struct vm_area_struct *vma)
uvm_destroy_vma_managed(vma, make_zombie);
// Notify GPU address spaces that the fault buffer needs to be flushed to
// avoid finding stale entries that can be attributed to new VA ranges
// reallocated at the same address.
for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
}
uvm_va_space_up_write(va_space);
if (current->mm != NULL)

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2021 NVIDIA Corporation
Copyright (c) 2021-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -94,4 +94,6 @@ void uvm_hal_ada_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->map_remap_larger_page_promotion = false;
parent_gpu->plc_supported = true;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -101,4 +101,6 @@ void uvm_hal_ampere_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->map_remap_larger_page_promotion = false;
parent_gpu->plc_supported = true;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018 NVIDIA Corporation
Copyright (c) 2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -107,10 +107,10 @@ static NV_STATUS service_ats_faults(uvm_gpu_va_space_t *gpu_va_space,
return status;
}
static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
NvU64 addr,
size_t size,
uvm_fault_client_type_t client_type)
static void flush_tlb_va_region(uvm_gpu_va_space_t *gpu_va_space,
NvU64 addr,
size_t size,
uvm_fault_client_type_t client_type)
{
uvm_ats_fault_invalidate_t *ats_invalidate;
@@ -119,12 +119,12 @@ static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
else
ats_invalidate = &gpu_va_space->gpu->parent->fault_buffer_info.non_replayable.ats_invalidate;
if (!ats_invalidate->write_faults_in_batch) {
uvm_tlb_batch_begin(&gpu_va_space->page_tables, &ats_invalidate->write_faults_tlb_batch);
ats_invalidate->write_faults_in_batch = true;
if (!ats_invalidate->tlb_batch_pending) {
uvm_tlb_batch_begin(&gpu_va_space->page_tables, &ats_invalidate->tlb_batch);
ats_invalidate->tlb_batch_pending = true;
}
uvm_tlb_batch_invalidate(&ats_invalidate->write_faults_tlb_batch, addr, size, PAGE_SIZE, UVM_MEMBAR_NONE);
uvm_tlb_batch_invalidate(&ats_invalidate->tlb_batch, addr, size, PAGE_SIZE, UVM_MEMBAR_NONE);
}
static void ats_batch_select_residency(uvm_gpu_va_space_t *gpu_va_space,
@@ -149,7 +149,11 @@ static void ats_batch_select_residency(uvm_gpu_va_space_t *gpu_va_space,
mode = vma_policy->mode;
if ((mode == MPOL_BIND) || (mode == MPOL_PREFERRED_MANY) || (mode == MPOL_PREFERRED)) {
if ((mode == MPOL_BIND)
#if defined(NV_MPOL_PREFERRED_MANY_PRESENT)
|| (mode == MPOL_PREFERRED_MANY)
#endif
|| (mode == MPOL_PREFERRED)) {
int home_node = NUMA_NO_NODE;
#if defined(NV_MEMPOLICY_HAS_HOME_NODE)
@@ -467,6 +471,10 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
uvm_page_mask_and(write_fault_mask, write_fault_mask, read_fault_mask);
else
uvm_page_mask_zero(write_fault_mask);
// There are no pending faults beyond write faults to RO region.
if (uvm_page_mask_empty(read_fault_mask))
return status;
}
ats_batch_select_residency(gpu_va_space, vma, ats_context);
@@ -489,6 +497,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
if (vma->vm_flags & VM_WRITE) {
uvm_page_mask_region_fill(faults_serviced_mask, subregion);
uvm_ats_smmu_invalidate_tlbs(gpu_va_space, start, length);
// The Linux kernel never invalidates TLB entries on mapping
// permission upgrade. This is a problem if the GPU has cached
@@ -499,7 +508,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
// infinite loop because we just forward the fault to the Linux
// kernel and it will see that the permissions in the page table are
// correct. Therefore, we flush TLB entries on ATS write faults.
flush_tlb_write_faults(gpu_va_space, start, length, client_type);
flush_tlb_va_region(gpu_va_space, start, length, client_type);
}
else {
uvm_page_mask_region_fill(reads_serviced_mask, subregion);
@@ -522,6 +531,15 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
return status;
uvm_page_mask_region_fill(faults_serviced_mask, subregion);
// Similarly to permission upgrade scenario, discussed above, GPU
// will not re-fetch the entry if the PTE is invalid and page size
// is 4K. To avoid infinite faulting loop, invalidate TLB for every
// new translation written explicitly like in the case of permission
// upgrade.
if (PAGE_SIZE == UVM_PAGE_SIZE_4K)
flush_tlb_va_region(gpu_va_space, start, length, client_type);
}
return status;
@@ -556,7 +574,7 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
NV_STATUS status;
uvm_push_t push;
if (!ats_invalidate->write_faults_in_batch)
if (!ats_invalidate->tlb_batch_pending)
return NV_OK;
UVM_ASSERT(gpu_va_space);
@@ -568,7 +586,7 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
"Invalidate ATS entries");
if (status == NV_OK) {
uvm_tlb_batch_end(&ats_invalidate->write_faults_tlb_batch, &push, UVM_MEMBAR_NONE);
uvm_tlb_batch_end(&ats_invalidate->tlb_batch, &push, UVM_MEMBAR_NONE);
uvm_push_end(&push);
// Add this push to the GPU's tracker so that fault replays/clears can
@@ -576,8 +594,7 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
status = uvm_tracker_add_push_safe(out_tracker, &push);
}
ats_invalidate->write_faults_in_batch = false;
ats_invalidate->tlb_batch_pending = false;
return status;
}

View File

@@ -52,7 +52,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
bool uvm_ats_check_in_gmmu_region(uvm_va_space_t *va_space, NvU64 address, uvm_va_range_t *next);
// This function performs pending TLB invalidations for ATS and clears the
// ats_invalidate->write_faults_in_batch flag
// ats_invalidate->tlb_batch_pending flag
NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
uvm_ats_fault_invalidate_t *ats_invalidate,
uvm_tracker_t *out_tracker);

View File

@@ -29,8 +29,13 @@
#include "uvm_va_space.h"
#include "uvm_va_space_mm.h"
#include <asm/io.h>
#include <linux/log2.h>
#include <linux/iommu.h>
#include <linux/mm_types.h>
#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/mmu_context.h>
// linux/sched/mm.h is needed for mmget_not_zero and mmput to get the mm
// reference required for the iommu_sva_bind_device() call. This header is not
@@ -46,17 +51,276 @@
#define UVM_IOMMU_SVA_BIND_DEVICE(dev, mm) iommu_sva_bind_device(dev, mm)
#endif
// Type to represent a 128-bit SMMU command queue command.
struct smmu_cmd {
NvU64 low;
NvU64 high;
};
// Base address of SMMU CMDQ-V for GSMMU0.
#define SMMU_CMDQV_BASE_ADDR(smmu_base) (smmu_base + 0x200000)
#define SMMU_CMDQV_BASE_LEN 0x00830000
// CMDQV configuration is done by firmware but we check status here.
#define SMMU_CMDQV_CONFIG 0x0
#define SMMU_CMDQV_CONFIG_CMDQV_EN BIT(0)
// Used to map a particular VCMDQ to a VINTF.
#define SMMU_CMDQV_CMDQ_ALLOC_MAP(vcmdq_id) (0x200 + 0x4 * (vcmdq_id))
#define SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC BIT(0)
// Shift for the field containing the index of the virtual interface
// owning the VCMDQ.
#define SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT 15
// Base address for the VINTF registers.
#define SMMU_VINTF_BASE_ADDR(cmdqv_base_addr, vintf_id) (cmdqv_base_addr + 0x1000 + 0x100 * (vintf_id))
// Virtual interface (VINTF) configuration registers. The WAR only
// works on baremetal so we need to configure ourselves as the
// hypervisor owner.
#define SMMU_VINTF_CONFIG 0x0
#define SMMU_VINTF_CONFIG_ENABLE BIT(0)
#define SMMU_VINTF_CONFIG_HYP_OWN BIT(17)
#define SMMU_VINTF_STATUS 0x0
#define SMMU_VINTF_STATUS_ENABLED BIT(0)
// Calculates the base address for a particular VCMDQ instance.
#define SMMU_VCMDQ_BASE_ADDR(cmdqv_base_addr, vcmdq_id) (cmdqv_base_addr + 0x10000 + 0x80 * (vcmdq_id))
// SMMU command queue consumer index register. Updated by SMMU
// when commands are consumed.
#define SMMU_VCMDQ_CONS 0x0
// SMMU command queue producer index register. Updated by UVM when
// commands are added to the queue.
#define SMMU_VCMDQ_PROD 0x4
// Configuration register used to enable a VCMDQ.
#define SMMU_VCMDQ_CONFIG 0x8
#define SMMU_VCMDQ_CONFIG_ENABLE BIT(0)
// Status register used to check the VCMDQ is enabled.
#define SMMU_VCMDQ_STATUS 0xc
#define SMMU_VCMDQ_STATUS_ENABLED BIT(0)
// Base address offset for the VCMDQ registers.
#define SMMU_VCMDQ_CMDQ_BASE 0x10000
// Size of the command queue. Each command is 16 bytes and we can't
// have a command queue greater than one page in size.
#define SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE (PAGE_SHIFT - ilog2(sizeof(struct smmu_cmd)))
#define SMMU_VCMDQ_CMDQ_ENTRIES (1UL << SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE)
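// Worked example, assuming 4 KiB pages: sizeof(struct smmu_cmd) is 16 bytes,
// so SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE is 12 - 4 = 8 and the queue holds
// SMMU_VCMDQ_CMDQ_ENTRIES = 256 commands, i.e. exactly one page.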
// We always use VINTF63 for the WAR
#define VINTF 63
static void smmu_vintf_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
{
iowrite32(val, SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg);
}
static NvU32 smmu_vintf_read32(void __iomem *smmu_cmdqv_base, int reg)
{
return ioread32(SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg);
}
// We always use VCMDQ127 for the WAR
#define VCMDQ 127
void smmu_vcmdq_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
{
iowrite32(val, SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
NvU32 smmu_vcmdq_read32(void __iomem *smmu_cmdqv_base, int reg)
{
return ioread32(SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
static void smmu_vcmdq_write64(void __iomem *smmu_cmdqv_base, int reg, NvU64 val)
{
iowrite64(val, SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
// Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
// TLB invalidates on read-only to read-write upgrades
static NV_STATUS uvm_ats_smmu_war_init(uvm_parent_gpu_t *parent_gpu)
{
uvm_spin_loop_t spin;
NV_STATUS status;
unsigned long cmdqv_config;
void __iomem *smmu_cmdqv_base;
struct acpi_iort_node *node;
struct acpi_iort_smmu_v3 *iort_smmu;
node = *(struct acpi_iort_node **) dev_get_platdata(parent_gpu->pci_dev->dev.iommu->iommu_dev->dev->parent);
iort_smmu = (struct acpi_iort_smmu_v3 *) node->node_data;
smmu_cmdqv_base = ioremap(SMMU_CMDQV_BASE_ADDR(iort_smmu->base_address), SMMU_CMDQV_BASE_LEN);
if (!smmu_cmdqv_base)
return NV_ERR_NO_MEMORY;
parent_gpu->smmu_war.smmu_cmdqv_base = smmu_cmdqv_base;
cmdqv_config = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CONFIG);
if (!(cmdqv_config & SMMU_CMDQV_CONFIG_CMDQV_EN)) {
status = NV_ERR_OBJECT_NOT_FOUND;
goto out;
}
// Allocate SMMU CMDQ pages for WAR
parent_gpu->smmu_war.smmu_cmdq = alloc_page(NV_UVM_GFP_FLAGS | __GFP_ZERO);
if (!parent_gpu->smmu_war.smmu_cmdq) {
status = NV_ERR_NO_MEMORY;
goto out;
}
// Initialise VINTF for the WAR
smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, SMMU_VINTF_CONFIG_ENABLE | SMMU_VINTF_CONFIG_HYP_OWN);
UVM_SPIN_WHILE(!(smmu_vintf_read32(smmu_cmdqv_base, SMMU_VINTF_STATUS) & SMMU_VINTF_STATUS_ENABLED), &spin);
// Allocate VCMDQ to VINTF
iowrite32((VINTF << SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT) | SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC,
smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
smmu_vcmdq_write64(smmu_cmdqv_base, SMMU_VCMDQ_CMDQ_BASE,
page_to_phys(parent_gpu->smmu_war.smmu_cmdq) | SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONS, 0);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_PROD, 0);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, SMMU_VCMDQ_CONFIG_ENABLE);
UVM_SPIN_WHILE(!(smmu_vcmdq_read32(smmu_cmdqv_base, SMMU_VCMDQ_STATUS) & SMMU_VCMDQ_STATUS_ENABLED), &spin);
uvm_mutex_init(&parent_gpu->smmu_war.smmu_lock, UVM_LOCK_ORDER_LEAF);
parent_gpu->smmu_war.smmu_prod = 0;
parent_gpu->smmu_war.smmu_cons = 0;
return NV_OK;
out:
iounmap(parent_gpu->smmu_war.smmu_cmdqv_base);
parent_gpu->smmu_war.smmu_cmdqv_base = NULL;
return status;
}
static void uvm_ats_smmu_war_deinit(uvm_parent_gpu_t *parent_gpu)
{
void __iomem *smmu_cmdqv_base = parent_gpu->smmu_war.smmu_cmdqv_base;
NvU32 cmdq_alloc_map;
if (parent_gpu->smmu_war.smmu_cmdqv_base) {
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, 0);
cmdq_alloc_map = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
iowrite32(cmdq_alloc_map & SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC, smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, 0);
}
if (parent_gpu->smmu_war.smmu_cmdq)
__free_page(parent_gpu->smmu_war.smmu_cmdq);
if (parent_gpu->smmu_war.smmu_cmdqv_base)
iounmap(parent_gpu->smmu_war.smmu_cmdqv_base);
}
// The SMMU on ARM64 can run under different translation regimes depending on
// what features the OS and CPU variant support. The CPU for GH180 supports
// virtualisation extensions and starts the kernel at EL2 meaning SMMU operates
// under the NS-EL2-E2H translation regime. Therefore we need to use the
// TLBI_EL2_* commands which invalidate TLB entries created under this
// translation regime.
#define CMDQ_OP_TLBI_EL2_ASID 0x21;
#define CMDQ_OP_TLBI_EL2_VA 0x22;
#define CMDQ_OP_CMD_SYNC 0x46
// Use the same maximum as used for MAX_TLBI_OPS in the upstream
// kernel.
#define UVM_MAX_TLBI_OPS (1UL << (PAGE_SHIFT - 3))
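// With 4 KiB pages this is 1UL << 9 = 512 invalidate-by-VA operations before
// falling back to a full-ASID invalidate (see uvm_ats_smmu_invalidate_tlbs()
// below), mirroring the upstream MAX_TLBI_OPS heuristic referenced above.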
#if UVM_ATS_SMMU_WAR_REQUIRED()
void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
struct mm_struct *mm = gpu_va_space->va_space->va_space_mm.mm;
uvm_parent_gpu_t *parent_gpu = gpu_va_space->gpu->parent;
struct {
NvU64 low;
NvU64 high;
} *vcmdq;
unsigned long vcmdq_prod;
NvU64 end;
uvm_spin_loop_t spin;
NvU16 asid;
if (!parent_gpu->smmu_war.smmu_cmdqv_base)
return;
asid = arm64_mm_context_get(mm);
vcmdq = kmap(parent_gpu->smmu_war.smmu_cmdq);
uvm_mutex_lock(&parent_gpu->smmu_war.smmu_lock);
vcmdq_prod = parent_gpu->smmu_war.smmu_prod;
// Our queue management is very simple. The mutex prevents multiple
// producers writing to the queue and all our commands require waiting for
// the queue to drain so we know it's empty. If we can't fit enough commands
// in the queue we just invalidate the whole ASID.
//
// The command queue is a circular buffer with the MSB representing a wrap
// bit that must toggle on each wrap. See the SMMU architecture
// specification for more details.
//
// SMMU_VCMDQ_CMDQ_ENTRIES - 1 because we need to leave space for the
// CMD_SYNC.
if ((size >> PAGE_SHIFT) > min(UVM_MAX_TLBI_OPS, SMMU_VCMDQ_CMDQ_ENTRIES - 1)) {
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_TLBI_EL2_ASID;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low |= (NvU64) asid << 48;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = 0;
vcmdq_prod++;
}
else {
for (end = addr + size; addr < end; addr += PAGE_SIZE) {
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_TLBI_EL2_VA;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low |= (NvU64) asid << 48;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = addr & ~((1UL << 12) - 1);
vcmdq_prod++;
}
}
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_CMD_SYNC;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = 0x0;
vcmdq_prod++;
// MSB is the wrap bit
vcmdq_prod &= (1UL << (SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE + 1)) - 1;
parent_gpu->smmu_war.smmu_prod = vcmdq_prod;
smmu_vcmdq_write32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_PROD, parent_gpu->smmu_war.smmu_prod);
UVM_SPIN_WHILE(
(smmu_vcmdq_read32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_CONS) & GENMASK(19, 0)) != vcmdq_prod,
&spin);
uvm_mutex_unlock(&parent_gpu->smmu_war.smmu_lock);
kunmap(parent_gpu->smmu_war.smmu_cmdq);
arm64_mm_context_put(mm);
}
#endif
NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
{
int ret;
ret = iommu_dev_enable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
if (ret)
return errno_to_nv_status(ret);
return errno_to_nv_status(ret);
if (UVM_ATS_SMMU_WAR_REQUIRED())
return uvm_ats_smmu_war_init(parent_gpu);
else
return NV_OK;
}
void uvm_ats_sva_remove_gpu(uvm_parent_gpu_t *parent_gpu)
{
if (UVM_ATS_SMMU_WAR_REQUIRED())
uvm_ats_smmu_war_deinit(parent_gpu);
iommu_dev_disable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
}

View File

@@ -32,23 +32,38 @@
// For ATS support on aarch64, arm_smmu_sva_bind() is needed for
// iommu_sva_bind_device() calls. Unfortunately, arm_smmu_sva_bind() is not
// conftest-able. We instead look for the presence of ioasid_get() or
// mm_pasid_set(). ioasid_get() was added in the same patch series as
// arm_smmu_sva_bind() and removed in v6.0. mm_pasid_set() was added in the
// mm_pasid_drop(). ioasid_get() was added in the same patch series as
// arm_smmu_sva_bind() and removed in v6.0. mm_pasid_drop() was added in the
// same patch as the removal of ioasid_get(). We assume the presence of
// arm_smmu_sva_bind() if ioasid_get(v5.11 - v5.17) or mm_pasid_set(v5.18+) is
// arm_smmu_sva_bind() if ioasid_get(v5.11 - v5.17) or mm_pasid_drop(v5.18+) is
// present.
//
// arm_smmu_sva_bind() was added with commit
// 32784a9562fb0518b12e9797ee2aec52214adf6f and ioasid_get() was added with
// commit cb4789b0d19ff231ce9f73376a023341300aed96 (11/23/2020). Commit
// 701fac40384f07197b106136012804c3cae0b3de (02/15/2022) removed ioasid_get()
// and added mm_pasid_set().
#if UVM_CAN_USE_MMU_NOTIFIERS() && (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_SET_PRESENT))
#define UVM_ATS_SVA_SUPPORTED() 1
// and added mm_pasid_drop().
#if UVM_CAN_USE_MMU_NOTIFIERS() && (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_DROP_PRESENT))
#if defined(CONFIG_IOMMU_SVA)
#define UVM_ATS_SVA_SUPPORTED() 1
#else
#define UVM_ATS_SVA_SUPPORTED() 0
#endif
#else
#define UVM_ATS_SVA_SUPPORTED() 0
#endif
// If NV_ARCH_INVALIDATE_SECONDARY_TLBS is defined it means the upstream fix is
// in place so no need for the WAR from Bug 4130089: [GH180][r535] WAR for
// kernel not issuing SMMU TLB invalidates on read-only
#if defined(NV_ARCH_INVALIDATE_SECONDARY_TLBS)
#define UVM_ATS_SMMU_WAR_REQUIRED() 0
#elif NVCPU_IS_AARCH64
#define UVM_ATS_SMMU_WAR_REQUIRED() 1
#else
#define UVM_ATS_SMMU_WAR_REQUIRED() 0
#endif
typedef struct
{
int placeholder;
@@ -77,6 +92,17 @@ typedef struct
// LOCKING: None
void uvm_ats_sva_unregister_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space);
// Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
// TLB invalidates on read-only to read-write upgrades
#if UVM_ATS_SMMU_WAR_REQUIRED()
void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size);
#else
static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
}
#endif
#else
static NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
{
@@ -107,6 +133,11 @@ typedef struct
{
}
static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
}
#endif // UVM_ATS_SVA_SUPPORTED
#endif // __UVM_ATS_SVA_H__

View File

@@ -191,7 +191,7 @@ static NV_STATUS test_membar(uvm_gpu_t *gpu)
for (i = 0; i < REDUCTIONS; ++i) {
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
gpu->parent->ce_hal->semaphore_reduction_inc(&push, host_mem_gpu_va, REDUCTIONS + 1);
gpu->parent->ce_hal->semaphore_reduction_inc(&push, host_mem_gpu_va, REDUCTIONS);
}
// Without a sys membar the channel tracking semaphore can and does complete
@@ -577,7 +577,7 @@ static NV_STATUS test_semaphore_reduction_inc(uvm_gpu_t *gpu)
for (i = 0; i < REDUCTIONS; i++) {
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va, i+1);
gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va, REDUCTIONS);
}
status = uvm_push_end_and_wait(&push);

View File

@@ -21,8 +21,8 @@
*******************************************************************************/
#ifndef _UVM_COMMON_H
#define _UVM_COMMON_H
#ifndef __UVM_COMMON_H__
#define __UVM_COMMON_H__
#ifdef DEBUG
#define UVM_IS_DEBUG() 1
@@ -413,4 +413,40 @@ static inline void uvm_touch_page(struct page *page)
// Return true if the VMA is one used by UVM managed allocations.
bool uvm_vma_is_managed(struct vm_area_struct *vma);
#endif /* _UVM_COMMON_H */
static bool uvm_platform_uses_canonical_form_address(void)
{
if (NVCPU_IS_PPC64LE)
return false;
return true;
}
// Similar to the GPU MMU HAL num_va_bits(), it returns the CPU's num_va_bits().
static NvU32 uvm_cpu_num_va_bits(void)
{
return fls64(TASK_SIZE - 1) + 1;
}
// Return the unaddressable range in a num_va_bits-wide VA space, [first, outer)
static void uvm_get_unaddressable_range(NvU32 num_va_bits, NvU64 *first, NvU64 *outer)
{
UVM_ASSERT(num_va_bits < 64);
UVM_ASSERT(first);
UVM_ASSERT(outer);
if (uvm_platform_uses_canonical_form_address()) {
*first = 1ULL << (num_va_bits - 1);
*outer = (NvU64)((NvS64)(1ULL << 63) >> (64 - num_va_bits));
}
else {
*first = 1ULL << num_va_bits;
*outer = ~0Ull;
}
}
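// Worked example: on a canonical-form platform with num_va_bits == 48 (e.g.
// x86-64 with 4-level paging, where TASK_SIZE is just under 1ULL << 47 and
// uvm_cpu_num_va_bits() returns 48), the hole computed above is
// [0x0000800000000000, 0xffff800000000000): bit 63 is sign-extended down to
// bit 47, splitting the VA space into canonical lower and upper halves.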
static void uvm_cpu_get_unaddressable_range(NvU64 *first, NvU64 *outer)
{
return uvm_get_unaddressable_range(uvm_cpu_num_va_bits(), first, outer);
}
#endif /* __UVM_COMMON_H__ */

View File

@@ -218,19 +218,12 @@ static bool gpu_supports_uvm(uvm_parent_gpu_t *parent_gpu)
return parent_gpu->rm_info.subdeviceCount == 1;
}
static bool platform_uses_canonical_form_address(void)
{
if (NVCPU_IS_PPC64LE)
return false;
return true;
}
bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
{
// Lower and upper address spaces are typically found in platforms that use
// the canonical address form.
NvU64 max_va_lower;
NvU64 min_va_upper;
NvU64 addr_end = addr + size - 1;
NvU8 gpu_addr_shift;
NvU8 cpu_addr_shift;
@@ -243,7 +236,7 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
UVM_ASSERT(size > 0);
gpu_addr_shift = gpu->address_space_tree.hal->num_va_bits();
cpu_addr_shift = fls64(TASK_SIZE - 1) + 1;
cpu_addr_shift = uvm_cpu_num_va_bits();
addr_shift = gpu_addr_shift;
// Pascal+ GPUs are capable of accessing kernel pointers in various modes
@@ -279,9 +272,7 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
// 0 +----------------+ 0 +----------------+
// On canonical form address platforms and Pascal+ GPUs.
if (platform_uses_canonical_form_address() && gpu_addr_shift > 40) {
NvU64 min_va_upper;
if (uvm_platform_uses_canonical_form_address() && gpu_addr_shift > 40) {
// On x86, when cpu_addr_shift > gpu_addr_shift, it means the CPU uses
// 5-level paging and the GPU is pre-Hopper. On Pascal-Ada GPUs (49b
// wide VA) we set addr_shift to match a 4-level paging x86 (48b wide).
@@ -292,15 +283,11 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
addr_shift = gpu_addr_shift;
else
addr_shift = cpu_addr_shift;
}
min_va_upper = (NvU64)((NvS64)(1ULL << 63) >> (64 - addr_shift));
max_va_lower = 1ULL << (addr_shift - 1);
return (addr_end < max_va_lower) || (addr >= min_va_upper);
}
else {
max_va_lower = 1ULL << addr_shift;
return addr_end < max_va_lower;
}
uvm_get_unaddressable_range(addr_shift, &max_va_lower, &min_va_upper);
return (addr_end < max_va_lower) || (addr >= min_va_upper);
}
// The internal UVM VAS does not use canonical form addresses.
@@ -326,14 +313,14 @@ NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr)
NvU8 addr_shift;
NvU64 input_addr = addr;
if (platform_uses_canonical_form_address()) {
if (uvm_platform_uses_canonical_form_address()) {
// When the CPU VA width is larger than GPU's, it means that:
// On ARM: the CPU is on LVA mode and the GPU is pre-Hopper.
// On x86: the CPU uses 5-level paging and the GPU is pre-Hopper.
// We sign-extend on the 48b on ARM and on the 47b on x86 to mirror the
// behavior of CPUs with smaller (than GPU) VA widths.
gpu_addr_shift = parent_gpu->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K)->num_va_bits();
cpu_addr_shift = fls64(TASK_SIZE - 1) + 1;
cpu_addr_shift = uvm_cpu_num_va_bits();
if (cpu_addr_shift > gpu_addr_shift)
addr_shift = NVCPU_IS_X86_64 ? 48 : 49;

View File

@@ -57,14 +57,16 @@
typedef struct
{
// Number of faults from this uTLB that have been fetched but have not been serviced yet
// Number of faults from this uTLB that have been fetched but have not been
// serviced yet.
NvU32 num_pending_faults;
// Whether the uTLB contains fatal faults
bool has_fatal_faults;
// We have issued a replay of type START_ACK_ALL while containing fatal faults. This puts
// the uTLB in lockdown mode and no new translations are accepted
// We have issued a replay of type START_ACK_ALL while containing fatal
// faults. This puts the uTLB in lockdown mode and no new translations are
// accepted.
bool in_lockdown;
// We have issued a cancel on this uTLB
@@ -126,8 +128,8 @@ struct uvm_service_block_context_struct
struct list_head service_context_list;
// A mask of GPUs that need to be checked for ECC errors before the CPU
// fault handler returns, but after the VA space lock has been unlocked to
// avoid the RM/UVM VA space lock deadlocks.
// fault handler returns, but after the VA space lock has been unlocked
// to avoid the RM/UVM VA space lock deadlocks.
uvm_processor_mask_t gpus_to_check_for_ecc;
// This is set to throttle page fault thrashing.
@@ -160,9 +162,9 @@ struct uvm_service_block_context_struct
struct
{
// Per-processor mask with the pages that will be resident after servicing.
// We need one mask per processor because we may coalesce faults that
// trigger migrations to different processors.
// Per-processor mask with the pages that will be resident after
// servicing. We need one mask per processor because we may coalesce
// faults that trigger migrations to different processors.
uvm_page_mask_t new_residency;
} per_processor_masks[UVM_ID_MAX_PROCESSORS];
@@ -263,7 +265,10 @@ struct uvm_fault_service_batch_context_struct
NvU32 num_coalesced_faults;
bool has_fatal_faults;
// One of the VA spaces in this batch which had fatal faults. If NULL, no
// faults were fatal. More than one VA space could have fatal faults, but we
// pick one to be the target of the cancel sequence.
uvm_va_space_t *fatal_va_space;
bool has_throttled_faults;
@@ -291,11 +296,8 @@ struct uvm_fault_service_batch_context_struct
struct uvm_ats_fault_invalidate_struct
{
// Whether the TLB batch contains any information
bool write_faults_in_batch;
// Batch of TLB entries to be invalidated
uvm_tlb_batch_t write_faults_tlb_batch;
bool tlb_batch_pending;
uvm_tlb_batch_t tlb_batch;
};
typedef struct
@@ -440,20 +442,9 @@ struct uvm_access_counter_service_batch_context_struct
NvU32 num_notifications;
// Boolean used to avoid sorting the fault batch by instance_ptr if we
// determine at fetch time that all the access counter notifications in the
// batch report the same instance_ptr
// determine at fetch time that all the access counter notifications in
// the batch report the same instance_ptr
bool is_single_instance_ptr;
// Scratch space, used to generate artificial physically addressed notifications.
// Virtual address notifications are always aligned to 64k. This means up to 16
// different physical locations could have been accessed to trigger one notification.
// The sub-granularity mask can correspond to any of them.
struct
{
uvm_processor_id_t resident_processors[16];
uvm_gpu_phys_address_t phys_addresses[16];
uvm_access_counter_buffer_entry_t phys_entry;
} scratch;
} virt;
struct
@@ -464,8 +455,8 @@ struct uvm_access_counter_service_batch_context_struct
NvU32 num_notifications;
// Boolean used to avoid sorting the fault batch by aperture if we
// determine at fetch time that all the access counter notifications in the
// batch report the same aperture
// determine at fetch time that all the access counter notifications in
// the batch report the same aperture
bool is_single_aperture;
} phys;
@@ -661,8 +652,8 @@ struct uvm_gpu_struct
struct
{
// Big page size used by the internal UVM VA space
// Notably it may be different than the big page size used by a user's VA
// space in general.
// Notably it may be different than the big page size used by a user's
// VA space in general.
NvU32 internal_size;
} big_page;
@@ -688,8 +679,8 @@ struct uvm_gpu_struct
// lazily-populated array of peer GPUs, indexed by the peer's GPU index
uvm_gpu_t *peer_gpus[UVM_ID_MAX_GPUS];
// Leaf spinlock used to synchronize access to the peer_gpus table so that
// it can be safely accessed from the access counters bottom half
// Leaf spinlock used to synchronize access to the peer_gpus table so
// that it can be safely accessed from the access counters bottom half
uvm_spinlock_t peer_gpus_lock;
} peer_info;
@@ -980,6 +971,10 @@ struct uvm_parent_gpu_struct
bool plc_supported;
// If true, page_tree initialization pre-populates no_ats_ranges. It only
// affects ATS systems.
bool no_ats_range_required;
// Parameters used by the TLB batching API
struct
{
@@ -1051,14 +1046,16 @@ struct uvm_parent_gpu_struct
// Interrupt handling state and locks
uvm_isr_info_t isr;
// Fault buffer info. This is only valid if supports_replayable_faults is set to true
// Fault buffer info. This is only valid if supports_replayable_faults is
// set to true.
uvm_fault_buffer_info_t fault_buffer_info;
// PMM lazy free processing queue.
// TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue.
nv_kthread_q_t lazy_free_q;
// Access counter buffer info. This is only valid if supports_access_counters is set to true
// Access counter buffer info. This is only valid if
// supports_access_counters is set to true.
uvm_access_counter_buffer_info_t access_counter_buffer_info;
// Number of uTLBs per GPC. This information is only valid on Pascal+ GPUs.
@@ -1108,7 +1105,7 @@ struct uvm_parent_gpu_struct
uvm_rb_tree_t instance_ptr_table;
uvm_spinlock_t instance_ptr_table_lock;
// This is set to true if the GPU belongs to an SLI group. Else, set to false.
// This is set to true if the GPU belongs to an SLI group.
bool sli_enabled;
struct
@@ -1135,8 +1132,8 @@ struct uvm_parent_gpu_struct
// environment, rather than using the peer-id field of the PTE (which can
// only address 8 gpus), all gpus are assigned a 47-bit physical address
// space by the fabric manager. Any physical address access to these
// physical address spaces are routed through the switch to the corresponding
// peer.
// physical address spaces are routed through the switch to the
// corresponding peer.
struct
{
bool is_nvswitch_connected;
@@ -1162,6 +1159,16 @@ struct uvm_parent_gpu_struct
NvU64 memory_window_start;
NvU64 memory_window_end;
} system_bus;
// WAR to issue ATS TLB invalidation commands ourselves.
struct
{
uvm_mutex_t smmu_lock;
struct page *smmu_cmdq;
void __iomem *smmu_cmdqv_base;
unsigned long smmu_prod;
unsigned long smmu_cons;
} smmu_war;
};
static const char *uvm_gpu_name(uvm_gpu_t *gpu)
@@ -1351,7 +1358,8 @@ void uvm_gpu_release_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
// They must not be the same gpu.
uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu);
// Get the processor id accessible by the given GPU for the given physical address
// Get the processor id accessible by the given GPU for the given physical
// address.
uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr);
// Get the P2P capabilities between the gpus with the given indexes
@@ -1448,9 +1456,9 @@ NV_STATUS uvm_gpu_check_ecc_error(uvm_gpu_t *gpu);
// Check for ECC errors without calling into RM
//
// Calling into RM is problematic in many places, this check is always safe to do.
// Returns NV_WARN_MORE_PROCESSING_REQUIRED if there might be an ECC error and
// it's required to call uvm_gpu_check_ecc_error() to be sure.
// Calling into RM is problematic in many places, this check is always safe to
// do. Returns NV_WARN_MORE_PROCESSING_REQUIRED if there might be an ECC error
// and it's required to call uvm_gpu_check_ecc_error() to be sure.
NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu);
// Map size bytes of contiguous sysmem on the GPU for physical access
@@ -1507,6 +1515,8 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
// The GPU must be initialized before calling this function.
bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
bool uvm_platform_uses_canonical_form_address(void);
// Returns addr's canonical form for host systems that use canonical form
// addresses.
NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr);
@@ -1553,8 +1563,9 @@ uvm_aperture_t uvm_gpu_page_tree_init_location(const uvm_gpu_t *gpu);
// Debug print of GPU properties
void uvm_gpu_print(uvm_gpu_t *gpu);
// Add the given instance pointer -> user_channel mapping to this GPU. The bottom
// half GPU page fault handler uses this to look up the VA space for GPU faults.
// Add the given instance pointer -> user_channel mapping to this GPU. The
// bottom half GPU page fault handler uses this to look up the VA space for GPU
// faults.
NV_STATUS uvm_gpu_add_user_channel(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel);
void uvm_gpu_remove_user_channel(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel);

View File

@@ -33,17 +33,17 @@
#include "uvm_va_space_mm.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_perf_module.h"
#include "uvm_ats_ibm.h"
#define UVM_PERF_ACCESS_COUNTER_BATCH_COUNT_MIN 1
#define UVM_PERF_ACCESS_COUNTER_BATCH_COUNT_DEFAULT 256
#define UVM_PERF_ACCESS_COUNTER_GRANULARITY_DEFAULT "2m"
#define UVM_PERF_ACCESS_COUNTER_GRANULARITY UVM_ACCESS_COUNTER_GRANULARITY_2M
#define UVM_PERF_ACCESS_COUNTER_THRESHOLD_MIN 1
#define UVM_PERF_ACCESS_COUNTER_THRESHOLD_MAX ((1 << 16) - 1)
#define UVM_PERF_ACCESS_COUNTER_THRESHOLD_DEFAULT 256
#define UVM_ACCESS_COUNTER_ACTION_NOTIFY 0x1
#define UVM_ACCESS_COUNTER_ACTION_CLEAR 0x2
#define UVM_ACCESS_COUNTER_ON_MANAGED 0x4
#define UVM_ACCESS_COUNTER_ACTION_CLEAR 0x1
#define UVM_ACCESS_COUNTER_PHYS_ON_MANAGED 0x2
// Each page in a tracked physical range may belong to a different VA Block. We
// preallocate an array of reverse map translations. However, access counter
@@ -54,12 +54,6 @@
#define UVM_MAX_TRANSLATION_SIZE (2 * 1024 * 1024ULL)
#define UVM_SUB_GRANULARITY_REGIONS 32
// The GPU offers the following tracking granularities: 64K, 2M, 16M, 16G
//
// Use the largest granularity to minimize the number of access counter
// notifications. This is fine because we simply drop the notifications during
// normal operation, and tests override these values.
static UVM_ACCESS_COUNTER_GRANULARITY g_uvm_access_counter_granularity;
static unsigned g_uvm_access_counter_threshold;
// Per-VA space access counters information
@@ -87,7 +81,6 @@ static int uvm_perf_access_counter_momc_migration_enable = -1;
static unsigned uvm_perf_access_counter_batch_count = UVM_PERF_ACCESS_COUNTER_BATCH_COUNT_DEFAULT;
// See module param documentation below
static char *uvm_perf_access_counter_granularity = UVM_PERF_ACCESS_COUNTER_GRANULARITY_DEFAULT;
static unsigned uvm_perf_access_counter_threshold = UVM_PERF_ACCESS_COUNTER_THRESHOLD_DEFAULT;
// Module parameters for the tunables
@@ -100,10 +93,6 @@ MODULE_PARM_DESC(uvm_perf_access_counter_momc_migration_enable,
"Whether MOMC access counters will trigger migrations."
"Valid values: <= -1 (default policy), 0 (off), >= 1 (on)");
module_param(uvm_perf_access_counter_batch_count, uint, S_IRUGO);
module_param(uvm_perf_access_counter_granularity, charp, S_IRUGO);
MODULE_PARM_DESC(uvm_perf_access_counter_granularity,
"Size of the physical memory region tracked by each counter. Valid values as"
"of Volta: 64k, 2m, 16m, 16g");
module_param(uvm_perf_access_counter_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(uvm_perf_access_counter_threshold,
"Number of remote accesses on a region required to trigger a notification."
@@ -136,7 +125,7 @@ static va_space_access_counters_info_t *va_space_access_counters_info_get(uvm_va
// Whether access counter migrations are enabled or not. The policy is as
// follows:
// - MIMC migrations are enabled by default on P9 systems with ATS support
// - MIMC migrations are disabled by default on all systems except P9.
// - MOMC migrations are disabled by default on all systems
// - Users can override this policy by specifying on/off
static bool is_migration_enabled(uvm_access_counter_type_t type)
@@ -159,7 +148,10 @@ static bool is_migration_enabled(uvm_access_counter_type_t type)
if (type == UVM_ACCESS_COUNTER_TYPE_MOMC)
return false;
return g_uvm_global.ats.supported;
if (UVM_ATS_IBM_SUPPORTED())
return g_uvm_global.ats.supported;
return false;
}
// Create the access counters tracking struct for the given VA space
@@ -225,30 +217,18 @@ static NV_STATUS config_granularity_to_bytes(UVM_ACCESS_COUNTER_GRANULARITY gran
return NV_OK;
}
// Clear the given access counter and add it to the per-GPU clear tracker
static NV_STATUS access_counter_clear_targeted(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *entry)
// Clear the access counter notifications and add it to the per-GPU clear
// tracker.
static NV_STATUS access_counter_clear_notifications(uvm_gpu_t *gpu,
uvm_access_counter_buffer_entry_t **notification_start,
NvU32 num_notifications)
{
NvU32 i;
NV_STATUS status;
uvm_push_t push;
uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
if (entry->address.is_virtual) {
status = uvm_push_begin(gpu->channel_manager,
UVM_CHANNEL_TYPE_MEMOPS,
&push,
"Clear access counter with virtual address: 0x%llx",
entry->address.address);
}
else {
status = uvm_push_begin(gpu->channel_manager,
UVM_CHANNEL_TYPE_MEMOPS,
&push,
"Clear access counter with physical address: 0x%llx:%s",
entry->address.address,
uvm_aperture_string(entry->address.aperture));
}
status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, &push, "Clear access counter batch");
if (status != NV_OK) {
UVM_ERR_PRINT("Error creating push to clear access counters: %s, GPU %s\n",
nvstatusToString(status),
@@ -256,7 +236,8 @@ static NV_STATUS access_counter_clear_targeted(uvm_gpu_t *gpu,
return status;
}
gpu->parent->host_hal->access_counter_clear_targeted(&push, entry);
for (i = 0; i < num_notifications; i++)
gpu->parent->host_hal->access_counter_clear_targeted(&push, notification_start[i]);
uvm_push_end(&push);
@@ -381,25 +362,6 @@ NV_STATUS uvm_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu)
g_uvm_access_counter_threshold = uvm_perf_access_counter_threshold;
}
if (strcmp(uvm_perf_access_counter_granularity, "64k") == 0) {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_64K;
}
else if (strcmp(uvm_perf_access_counter_granularity, "2m") == 0) {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_2M;
}
else if (strcmp(uvm_perf_access_counter_granularity, "16m") == 0) {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_16M;
}
else if (strcmp(uvm_perf_access_counter_granularity, "16g") == 0) {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_16G;
}
else {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_2M;
pr_info("Invalid value '%s' for uvm_perf_access_counter_granularity, using '%s' instead",
uvm_perf_access_counter_granularity,
UVM_PERF_ACCESS_COUNTER_GRANULARITY_DEFAULT);
}
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
UVM_ASSERT(parent_gpu->access_counter_buffer_hal != NULL);
@@ -422,7 +384,7 @@ NV_STATUS uvm_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu)
UVM_ASSERT(access_counters->rm_info.bufferSize %
parent_gpu->access_counter_buffer_hal->entry_size(parent_gpu) == 0);
status = config_granularity_to_bytes(g_uvm_access_counter_granularity, &granularity_bytes);
status = config_granularity_to_bytes(UVM_PERF_ACCESS_COUNTER_GRANULARITY, &granularity_bytes);
UVM_ASSERT(status == NV_OK);
if (granularity_bytes > UVM_MAX_TRANSLATION_SIZE)
UVM_ASSERT(granularity_bytes % UVM_MAX_TRANSLATION_SIZE == 0);
@@ -641,8 +603,8 @@ NV_STATUS uvm_gpu_access_counters_enable(uvm_gpu_t *gpu, uvm_va_space_t *va_spac
else {
UvmGpuAccessCntrConfig default_config =
{
.mimcGranularity = g_uvm_access_counter_granularity,
.momcGranularity = g_uvm_access_counter_granularity,
.mimcGranularity = UVM_PERF_ACCESS_COUNTER_GRANULARITY,
.momcGranularity = UVM_PERF_ACCESS_COUNTER_GRANULARITY,
.mimcUseLimit = UVM_ACCESS_COUNTER_USE_LIMIT_FULL,
.momcUseLimit = UVM_ACCESS_COUNTER_USE_LIMIT_FULL,
.threshold = g_uvm_access_counter_threshold,
@@ -767,6 +729,22 @@ static int cmp_sort_virt_notifications_by_instance_ptr(const void *_a, const voi
return cmp_access_counter_instance_ptr(a, b);
}
// Sort comparator for pointers to GVA access counter notification buffer
// entries that sorts by va_space, and fault address.
static int cmp_sort_virt_notifications_by_va_space_address(const void *_a, const void *_b)
{
const uvm_access_counter_buffer_entry_t **a = (const uvm_access_counter_buffer_entry_t **)_a;
const uvm_access_counter_buffer_entry_t **b = (const uvm_access_counter_buffer_entry_t **)_b;
int result;
result = UVM_CMP_DEFAULT((*a)->virtual_info.va_space, (*b)->virtual_info.va_space);
if (result != 0)
return result;
return UVM_CMP_DEFAULT((*a)->address.address, (*b)->address.address);
}
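
/*
 * Standalone illustration of the two-key ordering used above: sort by owning
 * VA space first, then by notification address. This sketch sorts plain
 * structs with qsort() rather than pointers to buffer entries; notif_t and its
 * fields are made-up names.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
    uintptr_t va_space;   /* stands in for the va_space pointer */
    uint64_t  address;
} notif_t;

static int cmp_u64(uint64_t a, uint64_t b)
{
    return (a > b) - (a < b);   /* same idea as UVM_CMP_DEFAULT */
}

static int cmp_notif(const void *pa, const void *pb)
{
    const notif_t *a = pa, *b = pb;
    int r = cmp_u64(a->va_space, b->va_space);

    return r ? r : cmp_u64(a->address, b->address);
}

int main(void)
{
    notif_t n[] = { {2, 0x10000}, {1, 0x30000}, {1, 0x20000}, {2, 0x00000} };

    qsort(n, 4, sizeof(n[0]), cmp_notif);
    for (int i = 0; i < 4; i++)
        printf("space=%zu addr=0x%llx\n",
               (size_t)n[i].va_space,
               (unsigned long long)n[i].address);
    return 0;
}
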
// Sort comparator for pointers to GPA access counter notification buffer
// entries that sorts by physical address' aperture
static int cmp_sort_phys_notifications_by_processor_id(const void *_a, const void *_b)
@@ -924,12 +902,11 @@ static void translate_virt_notifications_instance_ptrs(uvm_gpu_t *gpu,
// GVA notifications provide an instance_ptr and ve_id that can be directly
// translated to a VA space. In order to minimize translations, we sort the
// entries by instance_ptr.
// entries by instance_ptr, va_space and notification address in that order.
static void preprocess_virt_notifications(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context)
{
if (!batch_context->virt.is_single_instance_ptr) {
// Sort by instance_ptr
sort(batch_context->virt.notifications,
batch_context->virt.num_notifications,
sizeof(*batch_context->virt.notifications),
@@ -938,6 +915,12 @@ static void preprocess_virt_notifications(uvm_gpu_t *gpu,
}
translate_virt_notifications_instance_ptrs(gpu, batch_context);
sort(batch_context->virt.notifications,
batch_context->virt.num_notifications,
sizeof(*batch_context->virt.notifications),
cmp_sort_virt_notifications_by_va_space_address,
NULL);
}
// GPA notifications provide a physical address and an aperture. Sort
@@ -946,7 +929,6 @@ static void preprocess_virt_notifications(uvm_gpu_t *gpu,
static void preprocess_phys_notifications(uvm_access_counter_service_batch_context_t *batch_context)
{
if (!batch_context->phys.is_single_aperture) {
// Sort by instance_ptr
sort(batch_context->phys.notifications,
batch_context->phys.num_notifications,
sizeof(*batch_context->phys.notifications),
@@ -955,6 +937,28 @@ static void preprocess_phys_notifications(uvm_access_counter_service_batch_conte
}
}
static NV_STATUS notify_tools_and_process_flags(uvm_gpu_t *gpu,
uvm_access_counter_buffer_entry_t **notification_start,
NvU32 num_entries,
NvU32 flags)
{
NV_STATUS status = NV_OK;
if (uvm_enable_builtin_tests) {
// TODO: Bug 4310744: [UVM][TOOLS] Attribute access counter tools events
// to va_space instead of broadcasting.
NvU32 i;
for (i = 0; i < num_entries; i++)
uvm_tools_broadcast_access_counter(gpu, notification_start[i], flags & UVM_ACCESS_COUNTER_PHYS_ON_MANAGED);
}
if (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR)
status = access_counter_clear_notifications(gpu, notification_start, num_entries);
return status;
}
static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
@@ -1163,7 +1167,7 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *current_entry,
const uvm_reverse_map_t *reverse_mappings,
size_t num_reverse_mappings,
unsigned *out_flags)
NvU32 *out_flags)
{
size_t index;
uvm_va_block_t *va_block = reverse_mappings[0].va_block;
@@ -1190,7 +1194,6 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
// If an mm is registered with the VA space, we have to retain it
// in order to lock it before locking the VA space.
mm = uvm_va_space_mm_retain_lock(va_space);
uvm_va_space_down_read(va_space);
// Re-check that the VA block is valid after taking the VA block lock.
@@ -1251,7 +1254,7 @@ static NV_STATUS service_phys_va_blocks(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *current_entry,
const uvm_reverse_map_t *reverse_mappings,
size_t num_reverse_mappings,
unsigned *out_flags)
NvU32 *out_flags)
{
NV_STATUS status = NV_OK;
size_t index;
@@ -1259,7 +1262,7 @@ static NV_STATUS service_phys_va_blocks(uvm_gpu_t *gpu,
*out_flags &= ~UVM_ACCESS_COUNTER_ACTION_CLEAR;
for (index = 0; index < num_reverse_mappings; ++index) {
unsigned out_flags_local = 0;
NvU32 out_flags_local = 0;
status = service_phys_single_va_block(gpu,
batch_context,
current_entry,
@@ -1318,7 +1321,7 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
NvU64 address,
unsigned long sub_granularity,
size_t *num_reverse_mappings,
unsigned *out_flags)
NvU32 *out_flags)
{
NV_STATUS status;
NvU32 region_start, region_end;
@@ -1327,7 +1330,10 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
// Get the reverse_map translations for all the regions set in the
// sub_granularity field of the counter.
for_each_sub_granularity_region(region_start, region_end, sub_granularity, config->sub_granularity_regions_per_translation) {
for_each_sub_granularity_region(region_start,
region_end,
sub_granularity,
config->sub_granularity_regions_per_translation) {
NvU64 local_address = address + region_start * config->sub_granularity_region_size;
NvU32 local_translation_size = (region_end - region_start) * config->sub_granularity_region_size;
uvm_reverse_map_t *local_reverse_mappings = batch_context->phys.translations + *num_reverse_mappings;
@@ -1376,7 +1382,7 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context,
const uvm_access_counter_buffer_entry_t *current_entry,
unsigned *out_flags)
NvU32 *out_flags)
{
NvU64 address;
NvU64 translation_index;
@@ -1387,7 +1393,7 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
size_t total_reverse_mappings = 0;
uvm_gpu_t *resident_gpu = NULL;
NV_STATUS status = NV_OK;
unsigned flags = 0;
NvU32 flags = 0;
address = current_entry->address.address;
UVM_ASSERT(address % config->translation_size == 0);
@@ -1415,7 +1421,7 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
for (translation_index = 0; translation_index < config->translations_per_counter; ++translation_index) {
size_t num_reverse_mappings;
unsigned out_flags_local = 0;
NvU32 out_flags_local = 0;
status = service_phys_notification_translation(gpu,
resident_gpu,
batch_context,
@@ -1437,11 +1443,8 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
sub_granularity = sub_granularity >> config->sub_granularity_regions_per_translation;
}
// Currently we only report events for our tests, not for tools
if (uvm_enable_builtin_tests) {
*out_flags |= UVM_ACCESS_COUNTER_ACTION_NOTIFY;
*out_flags |= ((total_reverse_mappings != 0) ? UVM_ACCESS_COUNTER_ON_MANAGED : 0);
}
if (uvm_enable_builtin_tests)
*out_flags |= ((total_reverse_mappings != 0) ? UVM_ACCESS_COUNTER_PHYS_ON_MANAGED : 0);
if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
*out_flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
@@ -1454,22 +1457,21 @@ static NV_STATUS service_phys_notifications(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context)
{
NvU32 i;
uvm_access_counter_buffer_entry_t **notifications = batch_context->phys.notifications;
preprocess_phys_notifications(batch_context);
for (i = 0; i < batch_context->phys.num_notifications; ++i) {
NV_STATUS status;
uvm_access_counter_buffer_entry_t *current_entry = batch_context->phys.notifications[i];
unsigned flags = 0;
uvm_access_counter_buffer_entry_t *current_entry = notifications[i];
NvU32 flags = 0;
if (!UVM_ID_IS_VALID(current_entry->physical_info.resident_id))
continue;
status = service_phys_notification(gpu, batch_context, current_entry, &flags);
if (flags & UVM_ACCESS_COUNTER_ACTION_NOTIFY)
uvm_tools_broadcast_access_counter(gpu, current_entry, flags & UVM_ACCESS_COUNTER_ON_MANAGED);
if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
status = access_counter_clear_targeted(gpu, current_entry);
notify_tools_and_process_flags(gpu, &notifications[i], 1, flags);
if (status != NV_OK)
return status;
@@ -1478,152 +1480,218 @@ static NV_STATUS service_phys_notifications(uvm_gpu_t *gpu,
return NV_OK;
}
static int cmp_sort_gpu_phys_addr(const void *_a, const void *_b)
static NV_STATUS service_notification_va_block_helper(struct mm_struct *mm,
uvm_va_block_t *va_block,
uvm_processor_id_t processor,
uvm_access_counter_service_batch_context_t *batch_context)
{
return uvm_gpu_phys_addr_cmp(*(uvm_gpu_phys_address_t*)_a,
*(uvm_gpu_phys_address_t*)_b);
}
uvm_va_block_retry_t va_block_retry;
uvm_page_mask_t *accessed_pages = &batch_context->accessed_pages;
uvm_service_block_context_t *service_context = &batch_context->block_service_context;
static bool gpu_phys_same_region(uvm_gpu_phys_address_t a, uvm_gpu_phys_address_t b, NvU64 granularity)
{
if (a.aperture != b.aperture)
return false;
UVM_ASSERT(is_power_of_2(granularity));
return UVM_ALIGN_DOWN(a.address, granularity) == UVM_ALIGN_DOWN(b.address, granularity);
}
static bool phys_address_in_accessed_sub_region(uvm_gpu_phys_address_t address,
NvU64 region_size,
NvU64 sub_region_size,
NvU32 accessed_mask)
{
const unsigned accessed_index = (address.address % region_size) / sub_region_size;
// accessed_mask is only filled for tracking granularities larger than 64K
if (region_size == UVM_PAGE_SIZE_64K)
return true;
UVM_ASSERT(accessed_index < 32);
return ((1 << accessed_index) & accessed_mask) != 0;
}
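
/*
 * Standalone version of the accessed-mask test above: map an address to its
 * sub-region index within the tracked region and test that bit of the mask.
 * The 2MB/64KB sizes in main() are illustrative values, not a statement about
 * the configured granularity.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool sub_region_accessed(uint64_t address,
                                uint64_t region_size,
                                uint64_t sub_region_size,
                                uint32_t accessed_mask)
{
    unsigned index = (unsigned)((address % region_size) / sub_region_size);

    assert(index < 32);
    return (accessed_mask >> index) & 1u;
}

int main(void)
{
    uint64_t region = 2ull << 20;     /* 2MB tracking granularity */
    uint64_t sub    = 64ull << 10;    /* 64KB sub-granularity regions */
    uint32_t mask   = 0x5;            /* sub-regions 0 and 2 were accessed */

    printf("%d\n", sub_region_accessed(0x00000, region, sub, mask)); /* 1 */
    printf("%d\n", sub_region_accessed(0x10000, region, sub, mask)); /* 0 */
    printf("%d\n", sub_region_accessed(0x20000, region, sub, mask)); /* 1 */
    return 0;
}
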
static NV_STATUS service_virt_notification(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context,
const uvm_access_counter_buffer_entry_t *current_entry,
unsigned *out_flags)
{
NV_STATUS status = NV_OK;
NvU64 notification_size;
NvU64 address;
uvm_processor_id_t *resident_processors = batch_context->virt.scratch.resident_processors;
uvm_gpu_phys_address_t *phys_addresses = batch_context->virt.scratch.phys_addresses;
int num_addresses = 0;
int i;
// Virtual address notifications are always 64K aligned
NvU64 region_start = current_entry->address.address;
NvU64 region_end = current_entry->address.address + UVM_PAGE_SIZE_64K;
uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
uvm_access_counter_type_t counter_type = current_entry->counter_type;
const uvm_gpu_access_counter_type_config_t *config = get_config_for_type(access_counters, counter_type);
uvm_va_space_t *va_space = current_entry->virtual_info.va_space;
UVM_ASSERT(counter_type == UVM_ACCESS_COUNTER_TYPE_MIMC);
// Entries with NULL va_space are simply dropped.
if (!va_space)
if (uvm_page_mask_empty(accessed_pages))
return NV_OK;
status = config_granularity_to_bytes(config->rm.granularity, &notification_size);
if (status != NV_OK)
return status;
uvm_assert_mutex_locked(&va_block->lock);
// Collect physical locations that could have been touched
// in the reported 64K VA region. The notification mask can
// correspond to any of them.
uvm_va_space_down_read(va_space);
for (address = region_start; address < region_end;) {
uvm_va_block_t *va_block;
service_context->operation = UVM_SERVICE_OPERATION_ACCESS_COUNTERS;
service_context->num_retries = 0;
service_context->block_context.mm = mm;
NV_STATUS local_status = uvm_va_block_find(va_space, address, &va_block);
if (local_status == NV_ERR_INVALID_ADDRESS || local_status == NV_ERR_OBJECT_NOT_FOUND) {
address += PAGE_SIZE;
continue;
}
return UVM_VA_BLOCK_RETRY_LOCKED(va_block,
&va_block_retry,
service_va_block_locked(processor,
va_block,
&va_block_retry,
service_context,
accessed_pages));
}
uvm_mutex_lock(&va_block->lock);
while (address < va_block->end && address < region_end) {
const unsigned page_index = uvm_va_block_cpu_page_index(va_block, address);
static void expand_notification_block(struct mm_struct *mm,
uvm_gpu_va_space_t *gpu_va_space,
uvm_va_block_t *va_block,
uvm_page_mask_t *accessed_pages,
const uvm_access_counter_buffer_entry_t *current_entry)
{
NvU64 addr;
NvU64 granularity = 0;
uvm_gpu_t *resident_gpu = NULL;
uvm_processor_id_t resident_id;
uvm_page_index_t page_index;
uvm_gpu_t *gpu = gpu_va_space->gpu;
const uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
const uvm_gpu_access_counter_type_config_t *config = get_config_for_type(access_counters,
UVM_ACCESS_COUNTER_TYPE_MIMC);
// UVM va_block always maps the closest resident location to processor
const uvm_processor_id_t res_id = uvm_va_block_page_get_closest_resident(va_block, page_index, gpu->id);
config_granularity_to_bytes(config->rm.granularity, &granularity);
// Add physical location if it's valid and not local vidmem
if (UVM_ID_IS_VALID(res_id) && !uvm_id_equal(res_id, gpu->id)) {
uvm_gpu_phys_address_t phys_address = uvm_va_block_res_phys_page_address(va_block, page_index, res_id, gpu);
if (phys_address_in_accessed_sub_region(phys_address,
notification_size,
config->sub_granularity_region_size,
current_entry->sub_granularity)) {
resident_processors[num_addresses] = res_id;
phys_addresses[num_addresses] = phys_address;
++num_addresses;
}
else {
UVM_DBG_PRINT_RL("Skipping phys address %llx:%s, because it couldn't have been accessed in mask %x",
phys_address.address,
uvm_aperture_string(phys_address.aperture),
current_entry->sub_granularity);
}
}
// Granularities other than 2MB can only be enabled by UVM tests. Do nothing
// in that case.
if (granularity != UVM_PAGE_SIZE_2M)
return;
address += PAGE_SIZE;
}
uvm_mutex_unlock(&va_block->lock);
addr = current_entry->address.address;
uvm_assert_rwsem_locked(&gpu_va_space->va_space->lock);
uvm_assert_mutex_locked(&va_block->lock);
page_index = uvm_va_block_cpu_page_index(va_block, addr);
resident_id = uvm_va_block_page_get_closest_resident(va_block, page_index, gpu->id);
// resident_id might be invalid or might already be the same as the GPU
// which received the notification if the memory was already migrated before
// acquiring the locks either during the servicing of previous notifications
// or during faults or because of explicit migrations or if the VA range was
// freed after receiving the notification. Return NV_OK in such cases.
if (!UVM_ID_IS_VALID(resident_id) || uvm_id_equal(resident_id, gpu->id))
return;
if (UVM_ID_IS_GPU(resident_id))
resident_gpu = uvm_va_space_get_gpu(gpu_va_space->va_space, resident_id);
if (uvm_va_block_get_physical_size(va_block, resident_id, page_index) != granularity) {
uvm_page_mask_set(accessed_pages, page_index);
}
uvm_va_space_up_read(va_space);
else {
NvU32 region_start;
NvU32 region_end;
unsigned long sub_granularity = current_entry->sub_granularity;
NvU32 num_regions = config->sub_granularity_regions_per_translation;
NvU32 num_sub_pages = config->sub_granularity_region_size / PAGE_SIZE;
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);
// The addresses need to be sorted to aid coalescing.
sort(phys_addresses,
num_addresses,
sizeof(*phys_addresses),
cmp_sort_gpu_phys_addr,
NULL);
UVM_ASSERT(num_sub_pages >= 1);
for (i = 0; i < num_addresses; ++i) {
uvm_access_counter_buffer_entry_t *fake_entry = &batch_context->virt.scratch.phys_entry;
// Skip the current pointer if the physical region was already handled
if (i > 0 && gpu_phys_same_region(phys_addresses[i - 1], phys_addresses[i], notification_size)) {
UVM_ASSERT(uvm_id_equal(resident_processors[i - 1], resident_processors[i]));
continue;
// region_start and region_end refer to sub_granularity indices, not
// page_indices.
for_each_sub_granularity_region(region_start, region_end, sub_granularity, num_regions) {
uvm_page_mask_region_fill(accessed_pages,
uvm_va_block_region(region_start * num_sub_pages,
region_end * num_sub_pages));
}
UVM_DBG_PRINT_RL("Faking MIMC address[%i/%i]: %llx (granularity mask: %llx) in aperture %s on device %s\n",
i,
num_addresses,
phys_addresses[i].address,
notification_size - 1,
uvm_aperture_string(phys_addresses[i].aperture),
uvm_gpu_name(gpu));
// Construct a fake phys addr AC entry
fake_entry->counter_type = current_entry->counter_type;
fake_entry->address.address = UVM_ALIGN_DOWN(phys_addresses[i].address, notification_size);
fake_entry->address.aperture = phys_addresses[i].aperture;
fake_entry->address.is_virtual = false;
fake_entry->physical_info.resident_id = resident_processors[i];
fake_entry->counter_value = current_entry->counter_value;
fake_entry->sub_granularity = current_entry->sub_granularity;
// Remove pages in the va_block which are not resident on resident_id.
// If the GPU is heavily accessing those pages, future access counter
// migrations will migrate them to the GPU.
uvm_page_mask_and(accessed_pages, accessed_pages, resident_mask);
}
}
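
/*
 * The sub_granularity handling above fills the accessed-pages mask from runs
 * of set bits in the notification's sub-granularity mask. A standalone sketch
 * of walking such runs is shown below; a real implementation would convert
 * each [start, end) run into page indices instead of printing it.
 */
#include <stdint.h>
#include <stdio.h>

static void for_each_set_run(uint32_t mask, unsigned num_regions)
{
    unsigned start = 0;

    while (start < num_regions) {
        unsigned end;

        /* Skip clear bits. */
        while (start < num_regions && !((mask >> start) & 1u))
            start++;
        if (start == num_regions)
            break;

        /* Extend over the run of set bits. */
        end = start;
        while (end < num_regions && ((mask >> end) & 1u))
            end++;

        printf("region [%u, %u)\n", start, end);
        start = end;
    }
}

int main(void)
{
    for_each_set_run(0x000F00F1u, 32);  /* runs: [0,1), [4,8), [16,20) */
    return 0;
}
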
status = service_phys_notification(gpu, batch_context, fake_entry, out_flags);
if (status != NV_OK)
static NV_STATUS service_virt_notifications_in_block(struct mm_struct *mm,
uvm_gpu_va_space_t *gpu_va_space,
uvm_va_block_t *va_block,
uvm_access_counter_service_batch_context_t *batch_context,
NvU32 index,
NvU32 *out_index)
{
NvU32 i = index;
NvU32 flags = 0;
NV_STATUS status = NV_OK;
NV_STATUS flags_status;
uvm_gpu_t *gpu = gpu_va_space->gpu;
uvm_va_space_t *va_space = gpu_va_space->va_space;
uvm_page_mask_t *accessed_pages = &batch_context->accessed_pages;
uvm_access_counter_buffer_entry_t **notifications = batch_context->virt.notifications;
UVM_ASSERT(va_block);
UVM_ASSERT(i < batch_context->virt.num_notifications);
uvm_assert_rwsem_locked(&va_space->lock);
uvm_page_mask_zero(accessed_pages);
uvm_mutex_lock(&va_block->lock);
while (i < batch_context->virt.num_notifications) {
uvm_access_counter_buffer_entry_t *current_entry = notifications[i];
NvU64 address = current_entry->address.address;
if ((current_entry->virtual_info.va_space != va_space) || (address > va_block->end)) {
*out_index = i;
break;
}
expand_notification_block(mm, gpu_va_space, va_block, accessed_pages, current_entry);
i++;
*out_index = i;
}
status = service_notification_va_block_helper(mm, va_block, gpu->id, batch_context);
uvm_mutex_unlock(&va_block->lock);
// At least one notification should have been processed.
UVM_ASSERT(index < *out_index);
if (status == NV_OK)
flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
flags_status = notify_tools_and_process_flags(gpu, &notifications[index], *out_index - index, flags);
if ((status == NV_OK) && (flags_status != NV_OK))
status = flags_status;
return status;
}
static NV_STATUS service_virt_notifications_batch(struct mm_struct *mm,
uvm_gpu_va_space_t *gpu_va_space,
uvm_access_counter_service_batch_context_t *batch_context,
NvU32 index,
NvU32 *out_index)
{
NV_STATUS status;
uvm_va_block_t *va_block;
uvm_va_space_t *va_space = gpu_va_space->va_space;
uvm_access_counter_buffer_entry_t *current_entry = batch_context->virt.notifications[index];
NvU64 address = current_entry->address.address;
UVM_ASSERT(va_space);
uvm_assert_rwsem_locked(&va_space->lock);
// Virtual address notifications are always 64K aligned
UVM_ASSERT(IS_ALIGNED(address, UVM_PAGE_SIZE_64K));
// TODO: Bug 4309292: [UVM][HMM] Re-enable access counter HMM block
// migrations for virtual notifications on configs with
// 4KB page size
status = uvm_va_block_find(va_space, address, &va_block);
if ((status == NV_OK) && !uvm_va_block_is_hmm(va_block)) {
UVM_ASSERT(va_block);
status = service_virt_notifications_in_block(mm, gpu_va_space, va_block, batch_context, index, out_index);
}
else {
NvU32 flags = 0;
UVM_ASSERT((status == NV_ERR_OBJECT_NOT_FOUND) ||
(status == NV_ERR_INVALID_ADDRESS) ||
uvm_va_block_is_hmm(va_block));
// NV_ERR_OBJECT_NOT_FOUND is returned if the VA range is valid but no
// VA block has been allocated yet. This can happen if there are stale
// notifications in the batch. A new VA range may have been allocated in
// that range. So, clear the notification entry to continue getting
// notifications for the new VA range.
if (status == NV_ERR_OBJECT_NOT_FOUND)
flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
// NV_ERR_INVALID_ADDRESS is returned if the corresponding VA range
// doesn't exist or it's not a managed range. Access counter migrations
// are not currently supported on such ranges.
//
// TODO: Bug 1990466: [uvm] Use access counters to trigger migrations
// When support for SAM migrations is added, clear the notification
// entry if the VA range doesn't exist in order to receive notifications
// when a new VA range is allocated in that region.
status = notify_tools_and_process_flags(gpu_va_space->gpu, &batch_context->virt.notifications[index], 1, flags);
*out_index = index + 1;
status = NV_OK;
}
return status;
@@ -1632,33 +1700,67 @@ static NV_STATUS service_virt_notification(uvm_gpu_t *gpu,
static NV_STATUS service_virt_notifications(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context)
{
NvU32 i;
NvU32 i = 0;
NV_STATUS status = NV_OK;
struct mm_struct *mm = NULL;
uvm_va_space_t *va_space = NULL;
uvm_va_space_t *prev_va_space = NULL;
uvm_gpu_va_space_t *gpu_va_space = NULL;
// TODO: Bug 4299018 : Add support for virtual access counter migrations on
// 4K page sizes.
if (PAGE_SIZE == UVM_PAGE_SIZE_4K) {
return notify_tools_and_process_flags(gpu,
batch_context->virt.notifications,
batch_context->virt.num_notifications,
0);
}
preprocess_virt_notifications(gpu, batch_context);
for (i = 0; i < batch_context->virt.num_notifications; ++i) {
unsigned flags = 0;
while (i < batch_context->virt.num_notifications) {
uvm_access_counter_buffer_entry_t *current_entry = batch_context->virt.notifications[i];
va_space = current_entry->virtual_info.va_space;
status = service_virt_notification(gpu, batch_context, current_entry, &flags);
if (va_space != prev_va_space) {
UVM_DBG_PRINT_RL("Processed virt access counter (%d/%d): %sMANAGED (status: %d) clear: %s\n",
i + 1,
batch_context->virt.num_notifications,
(flags & UVM_ACCESS_COUNTER_ON_MANAGED) ? "" : "NOT ",
status,
(flags & UVM_ACCESS_COUNTER_ACTION_CLEAR) ? "YES" : "NO");
// New va_space detected, drop locks of the old va_space.
if (prev_va_space) {
uvm_va_space_up_read(prev_va_space);
uvm_va_space_mm_release_unlock(prev_va_space, mm);
if (uvm_enable_builtin_tests)
uvm_tools_broadcast_access_counter(gpu, current_entry, flags & UVM_ACCESS_COUNTER_ON_MANAGED);
mm = NULL;
gpu_va_space = NULL;
}
if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
status = access_counter_clear_targeted(gpu, current_entry);
// Acquire locks for the new va_space.
if (va_space) {
mm = uvm_va_space_mm_retain_lock(va_space);
uvm_va_space_down_read(va_space);
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
}
prev_va_space = va_space;
}
if (va_space && gpu_va_space && uvm_va_space_has_access_counter_migrations(va_space)) {
status = service_virt_notifications_batch(mm, gpu_va_space, batch_context, i, &i);
}
else {
status = notify_tools_and_process_flags(gpu, &batch_context->virt.notifications[i], 1, 0);
i++;
}
if (status != NV_OK)
break;
}
if (va_space) {
uvm_va_space_up_read(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
}
return status;
}
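
/*
 * The loop above processes the sorted notifications as runs that share a
 * va_space, acquiring the per-space locks only when the key changes. A generic
 * standalone sketch of that "group by key" walk follows; acquire_key() and
 * release_key() are stand-ins for the retain/down_read and up_read/release
 * pairs, and the batching within a run is omitted.
 */
#include <stddef.h>
#include <stdio.h>

typedef struct { int key; int value; } item_t;

static void acquire_key(int key) { printf("acquire %d\n", key); }
static void release_key(int key) { printf("release %d\n", key); }

static void process_runs(const item_t *items, size_t n)
{
    size_t i = 0;
    int prev_key = 0;
    int have_prev = 0;

    while (i < n) {
        int key = items[i].key;

        /* Key changed: drop state for the previous run, set up the new one. */
        if (!have_prev || key != prev_key) {
            if (have_prev)
                release_key(prev_key);
            acquire_key(key);
            prev_key = key;
            have_prev = 1;
        }

        printf("  item %d\n", items[i].value);
        i++;
    }

    if (have_prev)
        release_key(prev_key);
}

int main(void)
{
    const item_t items[] = { {1, 10}, {1, 11}, {2, 20}, {3, 30}, {3, 31} };

    process_runs(items, sizeof(items) / sizeof(items[0]));
    return 0;
}
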
@@ -1941,6 +2043,7 @@ NV_STATUS uvm_test_reset_access_counters(UVM_TEST_RESET_ACCESS_COUNTERS_PARAMS *
}
else {
uvm_access_counter_buffer_entry_t entry = { 0 };
uvm_access_counter_buffer_entry_t *notification = &entry;
if (params->counter_type == UVM_TEST_ACCESS_COUNTER_TYPE_MIMC)
entry.counter_type = UVM_ACCESS_COUNTER_TYPE_MIMC;
@@ -1950,7 +2053,7 @@ NV_STATUS uvm_test_reset_access_counters(UVM_TEST_RESET_ACCESS_COUNTERS_PARAMS *
entry.bank = params->bank;
entry.tag = params->tag;
status = access_counter_clear_targeted(gpu, &entry);
status = access_counter_clear_notifications(gpu, &notification, 1);
}
if (status == NV_OK)

View File

@@ -235,17 +235,27 @@ static NV_STATUS fetch_non_replayable_fault_buffer_entries(uvm_parent_gpu_t *par
return NV_OK;
}
// In SRIOV, the UVM (guest) driver does not have access to the privileged
// registers used to clear the faulted bit. Instead, UVM requests host RM to do
// the clearing on its behalf, using a SW method.
static bool use_clear_faulted_channel_sw_method(uvm_gpu_t *gpu)
{
if (uvm_gpu_is_virt_mode_sriov(gpu)) {
UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);
return true;
}
// If true, UVM uses a SW method to request RM to do the clearing on its
// behalf.
bool use_sw_method = false;
return false;
// In SRIOV, the UVM (guest) driver does not have access to the privileged
// registers used to clear the faulted bit.
if (uvm_gpu_is_virt_mode_sriov(gpu))
use_sw_method = true;
// In Confidential Computing access to the privileged registers is blocked,
// in order to prevent interference between guests, or between the
// (untrusted) host and the guests.
if (g_uvm_global.conf_computing_enabled)
use_sw_method = true;
if (use_sw_method)
UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);
return use_sw_method;
}
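
/*
 * Standalone restatement of the decision above: the faulted bit is cleared via
 * a SW method (delegated to RM) whenever the driver cannot touch the
 * privileged registers itself, i.e. under SR-IOV or Confidential Computing.
 * The two boolean inputs are placeholders for the driver's globals.
 */
#include <stdbool.h>
#include <stdio.h>

static bool clear_faulted_via_sw_method(bool is_sriov_guest, bool conf_computing)
{
    return is_sriov_guest || conf_computing;
}

int main(void)
{
    printf("bare metal:      %d\n", clear_faulted_via_sw_method(false, false));
    printf("SR-IOV guest:    %d\n", clear_faulted_via_sw_method(true,  false));
    printf("conf. computing: %d\n", clear_faulted_via_sw_method(false, true));
    return 0;
}
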
static NV_STATUS clear_faulted_method_on_gpu(uvm_gpu_t *gpu,
@@ -570,7 +580,7 @@ static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
ats_context->client_type = UVM_FAULT_CLIENT_TYPE_HUB;
ats_invalidate->write_faults_in_batch = false;
ats_invalidate->tlb_batch_pending = false;
va_range_next = uvm_va_space_iter_first(gpu_va_space->va_space, fault_entry->fault_address, ~0ULL);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -362,7 +362,8 @@ static NV_STATUS push_cancel_on_gpu(uvm_gpu_t *gpu,
"Cancel targeting instance_ptr {0x%llx:%s}\n",
instance_ptr.address,
uvm_aperture_string(instance_ptr.aperture));
} else {
}
else {
status = uvm_push_begin_acquire(gpu->channel_manager,
UVM_CHANNEL_TYPE_MEMOPS,
&replayable_faults->replay_tracker,
@@ -697,9 +698,6 @@ static inline int cmp_access_type(uvm_fault_access_type_t a, uvm_fault_access_ty
typedef enum
{
// Fetch a batch of faults from the buffer.
FAULT_FETCH_MODE_BATCH_ALL,
// Fetch a batch of faults from the buffer. Stop at the first entry that is
// not ready yet
FAULT_FETCH_MODE_BATCH_READY,
@@ -857,9 +855,7 @@ static NV_STATUS fetch_fault_buffer_entries(uvm_gpu_t *gpu,
// written out of order
UVM_SPIN_WHILE(!gpu->parent->fault_buffer_hal->entry_is_valid(gpu->parent, get), &spin) {
// We have some entry to work on. Let's do the rest later.
if (fetch_mode != FAULT_FETCH_MODE_ALL &&
fetch_mode != FAULT_FETCH_MODE_BATCH_ALL &&
fault_index > 0)
if (fetch_mode == FAULT_FETCH_MODE_BATCH_READY && fault_index > 0)
goto done;
}
@@ -888,6 +884,7 @@ static NV_STATUS fetch_fault_buffer_entries(uvm_gpu_t *gpu,
current_entry->va_space = NULL;
current_entry->filtered = false;
current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
if (current_entry->fault_source.utlb_id > batch_context->max_utlb_id) {
UVM_ASSERT(current_entry->fault_source.utlb_id < replayable_faults->utlb_count);
@@ -1184,7 +1181,11 @@ static void mark_fault_fatal(uvm_fault_service_batch_context_t *batch_context,
fault_entry->replayable.cancel_va_mode = cancel_va_mode;
utlb->has_fatal_faults = true;
batch_context->has_fatal_faults = true;
if (!batch_context->fatal_va_space) {
UVM_ASSERT(fault_entry->va_space);
batch_context->fatal_va_space = fault_entry->va_space;
}
}
static void fault_entry_duplicate_flags(uvm_fault_service_batch_context_t *batch_context,
@@ -1378,7 +1379,10 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
UVM_ASSERT(current_entry->fault_access_type ==
uvm_fault_access_type_mask_highest(current_entry->access_type_mask));
current_entry->is_fatal = false;
// Unserviceable faults were already skipped by the caller. There are no
// unserviceable fault types that could be in the same VA block as a
// serviceable fault.
UVM_ASSERT(!current_entry->is_fatal);
current_entry->is_throttled = false;
current_entry->is_invalid_prefetch = false;
@@ -1512,7 +1516,7 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
++block_context->num_retries;
if (status == NV_OK && batch_context->has_fatal_faults)
if (status == NV_OK && batch_context->fatal_va_space)
status = uvm_va_block_set_cancel(va_block, &block_context->block_context, gpu);
return status;
@@ -1676,7 +1680,8 @@ static NV_STATUS service_fault_batch_ats_sub_vma(uvm_gpu_va_space_t *gpu_va_spac
if (access_type <= UVM_FAULT_ACCESS_TYPE_READ) {
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
else if (access_type >= UVM_FAULT_ACCESS_TYPE_WRITE) {
else {
UVM_ASSERT(access_type >= UVM_FAULT_ACCESS_TYPE_WRITE);
if (uvm_fault_access_type_mask_test(current_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ) &&
!uvm_page_mask_test(reads_serviced_mask, page_index))
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
@@ -1735,6 +1740,10 @@ static NV_STATUS service_fault_batch_ats_sub(uvm_gpu_va_space_t *gpu_va_space,
uvm_fault_access_type_t access_type = current_entry->fault_access_type;
bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
// ATS faults can't be unserviceable, since unserviceable faults require
// GMMU PTEs.
UVM_ASSERT(!current_entry->is_fatal);
i++;
update_batch_and_notify_fault(gpu_va_space->gpu,
@@ -1934,14 +1943,198 @@ static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
return status;
}
// Called when a fault in the batch has been marked fatal. Flush the buffer
// under the VA and mmap locks to remove any potential stale fatal faults, then
// service all new faults for just that VA space and cancel those which are
// fatal. Faults in other VA spaces are replayed when done and will be processed
// when normal fault servicing resumes.
static NV_STATUS service_fault_batch_for_cancel(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
NV_STATUS status = NV_OK;
NvU32 i;
uvm_va_space_t *va_space = batch_context->fatal_va_space;
uvm_gpu_va_space_t *gpu_va_space = NULL;
struct mm_struct *mm;
uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
uvm_service_block_context_t *service_context = &gpu->parent->fault_buffer_info.replayable.block_service_context;
uvm_va_block_context_t *va_block_context = &service_context->block_context;
UVM_ASSERT(gpu->parent->replayable_faults_supported);
UVM_ASSERT(va_space);
// Perform the flush and re-fetch while holding the mmap_lock and the
// VA space lock. This avoids stale faults because it prevents any vma
// modifications (mmap, munmap, mprotect) from happening between the time HW
// takes the fault and we cancel it.
mm = uvm_va_space_mm_retain_lock(va_space);
va_block_context->mm = mm;
uvm_va_space_down_read(va_space);
// We saw fatal faults in this VA space before. Flush while holding
// mmap_lock to make sure those faults come back (aren't stale).
//
// We need to wait until all old fault messages have arrived before
// flushing, hence UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT.
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status != NV_OK)
goto done;
// Wait for the flush's replay to finish to give the legitimate faults a
// chance to show up in the buffer again.
status = uvm_tracker_wait(&replayable_faults->replay_tracker);
if (status != NV_OK)
goto done;
// We expect all replayed faults to have arrived in the buffer so we can re-
// service them. The replay-and-wait sequence above will ensure they're all
// in the HW buffer. When GSP owns the HW buffer, we also have to wait for
// GSP to copy all available faults from the HW buffer into the shadow
// buffer.
//
// TODO: Bug 2533557: This flush does not actually guarantee that GSP will
// copy over all faults.
status = hw_fault_buffer_flush_locked(gpu->parent);
if (status != NV_OK)
goto done;
// If there is no GPU VA space for the GPU, ignore all faults in the VA
// space. This can happen if the GPU VA space has been destroyed since we
// unlocked the VA space in service_fault_batch. That means the fatal faults
// are stale, because unregistering the GPU VA space requires preempting the
// context and detaching all channels in that VA space. Restart fault
// servicing from the top.
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (!gpu_va_space)
goto done;
// Re-parse the new faults
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_duplicate_faults = 0;
batch_context->num_replays = 0;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_ALL);
if (status != NV_OK)
goto done;
// No more faults left. Either the previously-seen fatal entry was stale, or
// RM killed the context underneath us.
if (batch_context->num_cached_faults == 0)
goto done;
++batch_context->batch_id;
status = preprocess_fault_batch(gpu, batch_context);
if (status != NV_OK) {
if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
// Another flush happened due to stale faults or a context-fatal
// error. The previously-seen fatal fault might not exist anymore,
// so restart fault servicing from the top.
status = NV_OK;
}
goto done;
}
// Search for the target VA space
for (i = 0; i < batch_context->num_coalesced_faults; i++) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
UVM_ASSERT(current_entry->va_space);
if (current_entry->va_space == va_space)
break;
}
while (i < batch_context->num_coalesced_faults) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
if (current_entry->va_space != va_space)
break;
// service_fault_batch_dispatch() doesn't expect unserviceable faults.
// Just cancel them directly.
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, UVM_FAULT_CANCEL_VA_MODE_ALL);
if (status != NV_OK)
break;
++i;
}
else {
uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
NvU32 block_faults;
ats_invalidate->tlb_batch_pending = false;
uvm_hmm_service_context_init(service_context);
// Service all the faults that we can. We only really need to search
// for fatal faults, but attempting to service all is the easiest
// way to do that.
status = service_fault_batch_dispatch(va_space, gpu_va_space, batch_context, i, &block_faults, false);
if (status != NV_OK) {
// TODO: Bug 3900733: clean up locking in service_fault_batch().
// We need to drop lock and retry. That means flushing and
// starting over.
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
status = NV_OK;
break;
}
// Invalidate TLBs before cancel to ensure that fatal faults don't
// get stuck in HW behind non-fatal faults to the same line.
status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
if (status != NV_OK)
break;
while (block_faults-- > 0) {
current_entry = batch_context->ordered_fault_cache[i];
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode);
if (status != NV_OK)
break;
}
++i;
}
}
}
done:
uvm_va_space_up_read(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
if (status == NV_OK) {
// There are two reasons to flush the fault buffer here.
//
// 1) Functional. We need to replay both the serviced non-fatal faults
// and the skipped faults in other VA spaces. The former need to be
// restarted and the latter need to be replayed so the normal fault
// service mechanism can fetch and process them.
//
// 2) Performance. After cancelling the fatal faults, a flush removes
// any potential duplicated fault that may have been added while
// processing the faults in this batch. This flush also avoids doing
// unnecessary processing after the fatal faults have been cancelled,
// so all the rest are unlikely to remain after a replay because the
// context is probably in the process of dying.
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
}
return status;
}
// Scan the ordered view of faults and group them by different va_blocks
// (managed faults) and service faults for each va_block, in batch.
// Service non-managed faults one at a time as they are encountered during the
// scan.
//
// This function returns NV_WARN_MORE_PROCESSING_REQUIRED if the fault buffer
// was flushed because the needs_fault_buffer_flush flag was set on some GPU VA
// space
// Fatal faults are marked for later processing by the caller.
static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
fault_service_mode_t service_mode,
uvm_fault_service_batch_context_t *batch_context)
@@ -1960,7 +2153,7 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
UVM_ASSERT(gpu->parent->replayable_faults_supported);
ats_invalidate->write_faults_in_batch = false;
ats_invalidate->tlb_batch_pending = false;
uvm_hmm_service_context_init(service_context);
for (i = 0; i < batch_context->num_coalesced_faults;) {
@@ -1995,38 +2188,25 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
va_block_context->mm = mm;
uvm_va_space_down_read(va_space);
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (uvm_processor_mask_test_and_clear_atomic(&va_space->needs_fault_buffer_flush, gpu->id)) {
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status == NV_OK)
status = NV_WARN_MORE_PROCESSING_REQUIRED;
break;
}
// The case where there is no valid GPU VA space for the GPU in this
// VA space is handled next
}
// Some faults could be already fatal if they cannot be handled by
// the UVM driver
if (current_entry->is_fatal) {
++i;
batch_context->has_fatal_faults = true;
if (!batch_context->fatal_va_space)
batch_context->fatal_va_space = va_space;
utlb->has_fatal_faults = true;
UVM_ASSERT(utlb->num_pending_faults > 0);
continue;
}
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
if (!gpu_va_space) {
// If there is no GPU VA space for the GPU, ignore the fault. This
// can happen if a GPU VA space is destroyed without explicitly
// freeing all memory ranges (destroying the VA range triggers a
// flush of the fault buffer) and there are stale entries in the
// freeing all memory ranges and there are stale entries in the
// buffer that got fixed by the servicing in a previous batch.
++i;
continue;
@@ -2044,15 +2224,17 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
uvm_va_space_mm_release_unlock(va_space, mm);
mm = NULL;
va_space = NULL;
status = NV_OK;
continue;
}
if (status != NV_OK)
goto fail;
i += block_faults;
// Don't issue replays in cancel mode
if (replay_per_va_block && !batch_context->has_fatal_faults) {
if (replay_per_va_block && !batch_context->fatal_va_space) {
status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
if (status != NV_OK)
goto fail;
@@ -2064,8 +2246,6 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
}
}
// Only clobber status if invalidate_status != NV_OK, since status may also
// contain NV_WARN_MORE_PROCESSING_REQUIRED.
if (va_space != NULL) {
NV_STATUS invalidate_status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
if (invalidate_status != NV_OK)
@@ -2273,77 +2453,48 @@ static NvU32 is_fatal_fault_in_buffer(uvm_fault_service_batch_context_t *batch_c
return false;
}
typedef enum
{
// Only cancel faults flagged as fatal
FAULT_CANCEL_MODE_FATAL,
// Cancel all faults in the batch unconditionally
FAULT_CANCEL_MODE_ALL,
} fault_cancel_mode_t;
// Cancel faults in the given fault service batch context. The function provides
// two different modes depending on the value of cancel_mode:
// - If cancel_mode == FAULT_CANCEL_MODE_FATAL, only faults flagged as fatal
// will be cancelled. In this case, the reason reported to tools is the one
// contained in the fault entry itself.
// - If cancel_mode == FAULT_CANCEL_MODE_ALL, all faults will be cancelled
// unconditionally. In this case, the reason reported to tools for non-fatal
// faults is the one passed to this function.
static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu,
uvm_fault_service_batch_context_t *batch_context,
fault_cancel_mode_t cancel_mode,
UvmEventFatalReason reason)
// Cancel all faults in the given fault service batch context, even those not
// marked as fatal.
static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
uvm_fault_service_batch_context_t *batch_context,
UvmEventFatalReason reason)
{
NV_STATUS status = NV_OK;
NV_STATUS fault_status;
uvm_va_space_t *va_space = NULL;
NvU32 i;
NvU32 i = 0;
UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
if (cancel_mode == FAULT_CANCEL_MODE_ALL)
UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
while (i < batch_context->num_coalesced_faults && status == NV_OK) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
uvm_va_space_t *va_space = current_entry->va_space;
bool skip_va_space;
UVM_ASSERT(current_entry->va_space);
UVM_ASSERT(va_space);
if (current_entry->va_space != va_space) {
// Fault on a different va_space, drop the lock of the old one...
if (va_space != NULL)
uvm_va_space_up_read(va_space);
uvm_va_space_down_read(va_space);
va_space = current_entry->va_space;
// If there is no GPU VA space for the GPU, ignore all faults in
// that VA space. This can happen if the GPU VA space has been
// destroyed since we unlocked the VA space in service_fault_batch.
// Ignoring the fault avoids targeting a PDB that might have been
// reused by another process.
skip_va_space = !uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
// ... and take the lock of the new one
uvm_va_space_down_read(va_space);
for (;
i < batch_context->num_coalesced_faults && current_entry->va_space == va_space;
current_entry = batch_context->ordered_fault_cache[++i]) {
uvm_fault_cancel_va_mode_t cancel_va_mode;
// We don't need to check whether a buffer flush is required
// (due to VA range destruction).
// - For cancel_mode == FAULT_CANCEL_MODE_FATAL, once a fault is
// flagged as fatal we need to cancel it, even if its VA range no
// longer exists.
// - For cancel_mode == FAULT_CANCEL_MODE_ALL we don't care about
// any of this, we just want to trigger RC in RM.
}
if (skip_va_space)
continue;
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
// If there is no GPU VA space for the GPU, ignore the fault.
// This can happen if the GPU VA did not exist in
// service_fault_batch(), or it was destroyed since then.
// This is to avoid targeting a PDB that might have been reused
// by another process.
continue;
}
// Cancel the fault
if (cancel_mode == FAULT_CANCEL_MODE_ALL || current_entry->is_fatal) {
uvm_fault_cancel_va_mode_t cancel_va_mode = current_entry->replayable.cancel_va_mode;
// If cancelling unconditionally and the fault was not fatal,
// set the cancel reason passed to this function
if (!current_entry->is_fatal) {
if (current_entry->is_fatal) {
UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
cancel_va_mode = current_entry->replayable.cancel_va_mode;
}
else {
current_entry->fatal_reason = reason;
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
@@ -2352,17 +2503,13 @@ static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu,
if (status != NV_OK)
break;
}
uvm_va_space_up_read(va_space);
}
if (va_space != NULL)
uvm_va_space_up_read(va_space);
// After cancelling the fatal faults, the fault buffer is flushed to remove
// any potential duplicated fault that may have been added while processing
// the faults in this batch. This flush also avoids doing unnecessary
// processing after the fatal faults have been cancelled, so all the rest
// are unlikely to remain after a replay because the context is probably in
// the process of dying.
// Because each cancel itself triggers a replay, there may be a large number
// of new duplicated faults in the buffer after cancelling all the known
// ones. Flushing the buffer discards them to avoid unnecessary processing.
fault_status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
@@ -2410,12 +2557,12 @@ static void cancel_fault_batch(uvm_gpu_t *gpu,
uvm_fault_service_batch_context_t *batch_context,
UvmEventFatalReason reason)
{
if (gpu->parent->fault_cancel_va_supported) {
cancel_faults_precise_va(gpu, batch_context, FAULT_CANCEL_MODE_ALL, reason);
return;
}
cancel_fault_batch_tlb(gpu, batch_context, reason);
// Return code is ignored since we're on a global error path and wouldn't be
// able to recover anyway.
if (gpu->parent->fault_cancel_va_supported)
cancel_faults_all(gpu, batch_context, reason);
else
cancel_fault_batch_tlb(gpu, batch_context, reason);
}
@@ -2502,7 +2649,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_replays = 0;
batch_context->has_fatal_faults = false;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
// 5) Fetch all faults from buffer
@@ -2549,9 +2696,6 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
// 8) Service all non-fatal faults and mark all non-serviceable faults
// as fatal
status = service_fault_batch(gpu, FAULT_SERVICE_MODE_CANCEL, batch_context);
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
continue;
UVM_ASSERT(batch_context->num_replays == 0);
if (status == NV_ERR_NO_MEMORY)
continue;
@@ -2559,7 +2703,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
break;
// No more fatal faults left, we are done
if (!batch_context->has_fatal_faults)
if (!batch_context->fatal_va_space)
break;
// 9) Search for uTLBs that contain fatal faults and meet the
@@ -2581,13 +2725,9 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
static NV_STATUS cancel_faults_precise(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
UVM_ASSERT(batch_context->has_fatal_faults);
if (gpu->parent->fault_cancel_va_supported) {
return cancel_faults_precise_va(gpu,
batch_context,
FAULT_CANCEL_MODE_FATAL,
UvmEventFatalReasonInvalid);
}
UVM_ASSERT(batch_context->fatal_va_space);
if (gpu->parent->fault_cancel_va_supported)
return service_fault_batch_for_cancel(gpu, batch_context);
return cancel_faults_precise_tlb(gpu, batch_context);
}
@@ -2643,7 +2783,7 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_duplicate_faults = 0;
batch_context->num_replays = 0;
batch_context->has_fatal_faults = false;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_BATCH_READY);
@@ -2671,9 +2811,6 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
// was flushed
num_replays += batch_context->num_replays;
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
continue;
enable_disable_prefetch_faults(gpu->parent, batch_context);
if (status != NV_OK) {
@@ -2687,10 +2824,17 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
break;
}
if (batch_context->has_fatal_faults) {
if (batch_context->fatal_va_space) {
status = uvm_tracker_wait(&batch_context->tracker);
if (status == NV_OK)
if (status == NV_OK) {
status = cancel_faults_precise(gpu, batch_context);
if (status == NV_OK) {
// Cancel handling should've issued at least one replay
UVM_ASSERT(batch_context->num_replays > 0);
++num_batches;
continue;
}
}
break;
}

View File

@@ -103,5 +103,7 @@ void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->map_remap_larger_page_promotion = false;
parent_gpu->plc_supported = true;
parent_gpu->no_ats_range_required = true;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2020-2022 NVIDIA Corporation
Copyright (c) 2020-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -33,6 +33,7 @@
#include "uvm_types.h"
#include "uvm_global.h"
#include "uvm_common.h"
#include "uvm_hal.h"
#include "uvm_hal_types.h"
#include "uvm_hopper_fault_buffer.h"
@@ -42,6 +43,10 @@
#define MMU_BIG 0
#define MMU_SMALL 1
// Used in pde_pcf().
#define ATS_ALLOWED 0
#define ATS_NOT_ALLOWED 1
uvm_mmu_engine_type_t uvm_hal_hopper_mmu_engine_id_to_type(NvU16 mmu_engine_id)
{
if (mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_HOST0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_HOST44)
@@ -260,7 +265,108 @@ static NvU64 poisoned_pte_hopper(void)
return WRITE_HWCONST64(pte_bits, _MMU_VER3, PTE, PCF, PRIVILEGE_RO_NO_ATOMIC_UNCACHED_ACD);
}
static NvU64 single_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, NvU32 depth)
typedef enum
{
PDE_TYPE_SINGLE,
PDE_TYPE_DUAL_BIG,
PDE_TYPE_DUAL_SMALL,
PDE_TYPE_COUNT,
} pde_type_t;
static const NvU8 valid_pcf[][2] = { { NV_MMU_VER3_PDE_PCF_VALID_UNCACHED_ATS_ALLOWED,
NV_MMU_VER3_PDE_PCF_VALID_UNCACHED_ATS_NOT_ALLOWED },
{ NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_UNCACHED_ATS_ALLOWED,
NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_UNCACHED_ATS_NOT_ALLOWED },
{ NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_UNCACHED_ATS_ALLOWED,
NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_UNCACHED_ATS_NOT_ALLOWED } };
static const NvU8 invalid_pcf[][2] = { { NV_MMU_VER3_PDE_PCF_INVALID_ATS_ALLOWED,
NV_MMU_VER3_PDE_PCF_INVALID_ATS_NOT_ALLOWED },
{ NV_MMU_VER3_DUAL_PDE_PCF_BIG_INVALID_ATS_ALLOWED,
NV_MMU_VER3_DUAL_PDE_PCF_BIG_INVALID_ATS_NOT_ALLOWED },
{ NV_MMU_VER3_DUAL_PDE_PCF_SMALL_INVALID_ATS_ALLOWED,
NV_MMU_VER3_DUAL_PDE_PCF_SMALL_INVALID_ATS_NOT_ALLOWED } };
static const NvU8 va_base[] = { 56, 47, 38, 29, 21 };
static bool is_ats_range_valid(uvm_page_directory_t *dir, NvU32 child_index)
{
NvU64 pde_base_va;
NvU64 min_va_upper;
NvU64 max_va_lower;
NvU32 index_in_dir;
uvm_cpu_get_unaddressable_range(&max_va_lower, &min_va_upper);
UVM_ASSERT(dir->depth < ARRAY_SIZE(va_base));
// We can use UVM_PAGE_SIZE_AGNOSTIC because page_size is only used in
// index_bits_hopper() for PTE table, i.e., depth 5+, which does not use a
// PDE PCF or an ATS_ALLOWED/NOT_ALLOWED setting.
UVM_ASSERT(child_index < (1ull << index_bits_hopper(dir->depth, UVM_PAGE_SIZE_AGNOSTIC)));
pde_base_va = 0;
index_in_dir = child_index;
while (dir) {
pde_base_va += index_in_dir * (1ull << va_base[dir->depth]);
index_in_dir = dir->index_in_parent;
dir = dir->host_parent;
}
pde_base_va = (NvU64)((NvS64)(pde_base_va << (64 - num_va_bits_hopper())) >> (64 - num_va_bits_hopper()));
if (pde_base_va < max_va_lower || pde_base_va >= min_va_upper)
return true;
return false;
}
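
/*
 * is_ats_range_valid() rebuilds a PDE's base VA from its index chain and then
 * sign-extends it into canonical form before comparing it against the
 * unaddressable hole. The sketch below shows only the sign-extension step,
 * assuming a 57-bit VA width purely as an example value.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t canonicalize(uint64_t va, unsigned va_bits)
{
    unsigned shift = 64 - va_bits;

    /* Shift the top implemented VA bit into bit 63, then arithmetic-shift
     * back down so it is replicated into the upper, unimplemented bits. */
    return (uint64_t)(((int64_t)(va << shift)) >> shift);
}

int main(void)
{
    unsigned va_bits = 57;

    /* An address with the top implemented bit set sign-extends into the
     * high half of the canonical address space. */
    uint64_t high = canonicalize(1ull << 56, va_bits);
    /* A low address is unchanged. */
    uint64_t low  = canonicalize(0x12345000ull, va_bits);

    printf("high: 0x%016llx\n", (unsigned long long)high); /* 0xff00000000000000 */
    printf("low:  0x%016llx\n", (unsigned long long)low);
    return 0;
}
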
// PDE Permission Control Flags
static NvU32 pde_pcf(bool valid, pde_type_t pde_type, uvm_page_directory_t *dir, NvU32 child_index)
{
const NvU8 (*pcf)[2] = valid ? valid_pcf : invalid_pcf;
NvU8 depth = dir->depth;
UVM_ASSERT(pde_type < PDE_TYPE_COUNT);
UVM_ASSERT(depth < 5);
// On non-ATS systems, PDE PCF only sets the valid and volatile/cache bits.
if (!g_uvm_global.ats.enabled)
return pcf[pde_type][ATS_ALLOWED];
// We assume all supported ATS platforms use canonical form address.
// See comments in uvm_gpu.c:uvm_gpu_can_address() and in
// uvm_mmu.c:page_tree_ats_init();
UVM_ASSERT(uvm_platform_uses_canonical_form_address());
// Hopper GPUs on ATS-enabled systems perform a parallel lookup on both
// ATS and GMMU page tables. For managed memory we need to prevent this
// parallel lookup since we would not get any GPU fault if the CPU has
// a valid mapping. Also, for external ranges that are known to be
// mapped entirely on the GMMU page table we can skip the ATS lookup
// for performance reasons. Parallel ATS lookup is disabled in PDE1
// (depth 3) and, therefore, it applies to the underlying 512MB VA
// range.
//
// UVM sets ATS_NOT_ALLOWED for all Hopper+ mappings on ATS systems.
// This is fine because CUDA ensures that all managed and external
// allocations are properly compartmentalized in 512MB-aligned VA
// regions. For cudaHostRegister CUDA cannot control the VA range, but
// we rely on ATS for those allocations so they can't choose the
// ATS_NOT_ALLOWED mode.
// TODO: Bug 3254055: Relax the NO_ATS setting from 512MB (pde1) range to
// PTEs.
// HW complies with the leaf PDE's ATS_ALLOWED/ATS_NOT_ALLOWED settings,
// enabling us to treat any upper-level PDE as a don't care as long as there
// are leaf PDEs for the entire upper-level PDE range. We assume PDE4
// entries (depth == 0) are always ATS enabled, and the no_ats_range is in
// PDE3 or lower.
if (depth == 0 || (!valid && is_ats_range_valid(dir, child_index)))
return pcf[pde_type][ATS_ALLOWED];
return pcf[pde_type][ATS_NOT_ALLOWED];
}
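
/*
 * pde_pcf() above replaces per-case HWCONST selections with small lookup
 * tables indexed by PDE type and ATS-allowed state. The standalone sketch
 * below shows the same table-selection pattern; the numeric values are
 * placeholders, not the hardware NV_MMU_VER3_* encodings.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { PDE_SINGLE, PDE_DUAL_BIG, PDE_DUAL_SMALL, PDE_TYPE_COUNT };
enum { ATS_OK = 0, ATS_BLOCKED = 1 };

static const uint8_t valid_pcf[PDE_TYPE_COUNT][2]   = { {0, 1}, {2, 3}, {4, 5} };
static const uint8_t invalid_pcf[PDE_TYPE_COUNT][2] = { {6, 7}, {8, 9}, {10, 11} };

static uint8_t pick_pcf(bool valid, int pde_type, bool ats_allowed)
{
    /* Pick the valid or invalid table, then index by type and ATS column. */
    const uint8_t (*table)[2] = valid ? valid_pcf : invalid_pcf;

    return table[pde_type][ats_allowed ? ATS_OK : ATS_BLOCKED];
}

int main(void)
{
    printf("%u\n", (unsigned)pick_pcf(true,  PDE_DUAL_BIG,   false)); /* 3 */
    printf("%u\n", (unsigned)pick_pcf(false, PDE_DUAL_SMALL, true));  /* 10 */
    return 0;
}
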
static NvU64 single_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, uvm_page_directory_t *dir, NvU32 child_index)
{
NvU64 pde_bits = 0;
@@ -280,38 +386,17 @@ static NvU64 single_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, NvU32 dep
break;
}
// PCF (permission control flags) 5:3
// Hopper GPUs on ATS-enabled systems perform a parallel lookup on both
// ATS and GMMU page tables. For managed memory we need to prevent this
// parallel lookup since we would not get any GPU fault if the CPU has
// a valid mapping. Also, for external ranges that are known to be
// mapped entirely on the GMMU page table we can skip the ATS lookup
// for performance reasons. Parallel ATS lookup is disabled in PDE1
// (depth 3) and, therefore, it applies to the underlying 512MB VA
// range.
//
// UVM sets ATS_NOT_ALLOWED for all Hopper+ mappings on ATS systems.
// This is fine because CUDA ensures that all managed and external
// allocations are properly compartmentalized in 512MB-aligned VA
// regions. For cudaHostRegister CUDA cannot control the VA range, but
// we rely on ATS for those allocations so they can't choose the
// ATS_NOT_ALLOWED mode.
//
// TODO: Bug 3254055: Relax the NO_ATS setting from 512MB (pde1) range
// to PTEs.
if (depth == 3 && g_uvm_global.ats.enabled)
pde_bits |= HWCONST64(_MMU_VER3, PDE, PCF, VALID_UNCACHED_ATS_NOT_ALLOWED);
else
pde_bits |= HWCONST64(_MMU_VER3, PDE, PCF, VALID_UNCACHED_ATS_ALLOWED);
// address 51:12
pde_bits |= HWVALUE64(_MMU_VER3, PDE, ADDRESS, address);
}
// PCF (permission control flags) 5:3
pde_bits |= HWVALUE64(_MMU_VER3, PDE, PCF, pde_pcf(phys_alloc != NULL, PDE_TYPE_SINGLE, dir, child_index));
return pde_bits;
}
static NvU64 big_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
static NvU64 big_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, uvm_page_directory_t *dir, NvU32 child_index)
{
NvU64 pde_bits = 0;
@@ -330,17 +415,20 @@ static NvU64 big_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
break;
}
// PCF (permission control flags) 5:3
pde_bits |= HWCONST64(_MMU_VER3, DUAL_PDE, PCF_BIG, VALID_UNCACHED_ATS_NOT_ALLOWED);
// address 51:8
pde_bits |= HWVALUE64(_MMU_VER3, DUAL_PDE, ADDRESS_BIG, address);
}
// PCF (permission control flags) 5:3
pde_bits |= HWVALUE64(_MMU_VER3,
DUAL_PDE,
PCF_BIG,
pde_pcf(phys_alloc != NULL, PDE_TYPE_DUAL_BIG, dir, child_index));
return pde_bits;
}
static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, uvm_page_directory_t *dir, NvU32 child_index)
{
NvU64 pde_bits = 0;
@@ -359,29 +447,40 @@ static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
break;
}
// PCF (permission control flags) 69:67 [5:3]
pde_bits |= HWCONST64(_MMU_VER3, DUAL_PDE, PCF_SMALL, VALID_UNCACHED_ATS_NOT_ALLOWED);
// address 115:76 [51:12]
pde_bits |= HWVALUE64(_MMU_VER3, DUAL_PDE, ADDRESS_SMALL, address);
}
// PCF (permission control flags) 69:67 [5:3]
pde_bits |= HWVALUE64(_MMU_VER3,
DUAL_PDE,
PCF_SMALL,
pde_pcf(phys_alloc != NULL, PDE_TYPE_DUAL_SMALL, dir, child_index));
return pde_bits;
}
static void make_pde_hopper(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
static void make_pde_hopper(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
uvm_page_directory_t *dir,
NvU32 child_index)
{
NvU32 entry_count = entries_per_index_hopper(depth);
NvU32 entry_count;
NvU64 *entry_bits = (NvU64 *)entry;
UVM_ASSERT(dir);
entry_count = entries_per_index_hopper(dir->depth);
if (entry_count == 1) {
*entry_bits = single_pde_hopper(*phys_allocs, depth);
*entry_bits = single_pde_hopper(*phys_allocs, dir, child_index);
}
else if (entry_count == 2) {
entry_bits[MMU_BIG] = big_half_pde_hopper(phys_allocs[MMU_BIG]);
entry_bits[MMU_SMALL] = small_half_pde_hopper(phys_allocs[MMU_SMALL]);
entry_bits[MMU_BIG] = big_half_pde_hopper(phys_allocs[MMU_BIG], dir, child_index);
entry_bits[MMU_SMALL] = small_half_pde_hopper(phys_allocs[MMU_SMALL], dir, child_index);
// This entry applies to the whole dual PDE but is stored in the lower
// bits
// bits.
entry_bits[MMU_BIG] |= HWCONST64(_MMU_VER3, DUAL_PDE, IS_PTE, FALSE);
}
else {

View File

@@ -128,8 +128,9 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
// present if we see the callback.
//
// The callback was added in commit 0f0a327fa12cd55de5e7f8c05a70ac3d047f405e,
// v3.19 (2014-11-13).
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE)
// v3.19 (2014-11-13) and renamed in commit 1af5a8109904.
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE) || \
defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
#define UVM_CAN_USE_MMU_NOTIFIERS() 1
#else
#define UVM_CAN_USE_MMU_NOTIFIERS() 0
@@ -153,10 +154,6 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
#define VM_MIXEDMAP 0x00000000
#endif
#if !defined(MPOL_PREFERRED_MANY)
#define MPOL_PREFERRED_MANY 5
#endif
//
// printk.h already defined pr_fmt, so we have to redefine it so the pr_*
// routines pick up our version

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2021 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -71,4 +71,6 @@ void uvm_hal_maxwell_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->smc.supported = false;
parent_gpu->plc_supported = false;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2021 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -106,10 +106,16 @@ static NvU64 small_half_pde_maxwell(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_maxwell(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
static void make_pde_maxwell(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
uvm_page_directory_t *dir,
NvU32 child_index)
{
NvU64 pde_bits = 0;
UVM_ASSERT(depth == 0);
UVM_ASSERT(dir);
UVM_ASSERT(dir->depth == 0);
pde_bits |= HWCONST64(_MMU, PDE, SIZE, FULL);
pde_bits |= big_half_pde_maxwell(phys_allocs[MMU_BIG]) | small_half_pde_maxwell(phys_allocs[MMU_SMALL]);

View File

@@ -672,14 +672,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
.finalize_and_map = uvm_migrate_vma_finalize_and_map_helper,
};
// WAR for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
//
// This code path isn't used on GH180 but we need to maintain consistent
// behaviour on systems that do.
if (!vma_is_anonymous(args->vma))
return NV_WARN_NOTHING_TO_DO;
ret = migrate_vma(&uvm_migrate_vma_ops, args->vma, args->start, args->end, args->src, args->dst, state);
if (ret < 0)
return errno_to_nv_status(ret);
@@ -693,24 +685,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
if (ret < 0)
return errno_to_nv_status(ret);
// TODO: Bug 2419180: support file-backed pages in migrate_vma, when
// support for it is added to the Linux kernel
//
// A side-effect of migrate_vma_setup() is it calls mmu notifiers even if a
// page can't be migrated (eg. because it's a non-anonymous mapping). We
// need this side-effect for SMMU on GH180 to ensure any cached read-only
// entries are flushed from SMMU on permission upgrade.
//
// TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
//
// The above WAR doesn't work for HugeTLBfs mappings because
// migrate_vma_setup() will fail in that case.
if (!vma_is_anonymous(args->vma)) {
migrate_vma_finalize(args);
return NV_WARN_NOTHING_TO_DO;
}
uvm_migrate_vma_alloc_and_copy(args, state);
if (state->status == NV_OK) {
migrate_vma_pages(args);
@@ -862,6 +836,17 @@ static NV_STATUS migrate_pageable_vma_region(struct vm_area_struct *vma,
return NV_OK;
}
NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
uvm_va_space_down_write(va_space);
va_space->test.skip_migrate_vma = params->skip;
uvm_va_space_up_write(va_space);
return NV_OK;
}
static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
unsigned long start,
unsigned long outer,
@@ -884,13 +869,12 @@ static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
start = max(start, vma->vm_start);
outer = min(outer, vma->vm_end);
// migrate_vma only supports anonymous VMAs. We check for those after
// calling migrate_vma_setup() to workaround Bug 4130089. We need to check
// for HugeTLB VMAs here because migrate_vma_setup() will return a fatal
// error for those.
// TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
if (is_vm_hugetlb_page(vma))
if (va_space->test.skip_migrate_vma)
return NV_WARN_NOTHING_TO_DO;
// TODO: Bug 2419180: support file-backed pages in migrate_vma, when
// support for it is added to the Linux kernel
if (!vma_is_anonymous(vma))
return NV_WARN_NOTHING_TO_DO;
if (uvm_processor_mask_empty(&va_space->registered_gpus))
@@ -950,7 +934,9 @@ static NV_STATUS migrate_pageable(migrate_vma_state_t *state)
bool touch = uvm_migrate_args->touch;
uvm_populate_permissions_t populate_permissions = uvm_migrate_args->populate_permissions;
UVM_ASSERT(!vma_is_anonymous(vma) || uvm_processor_mask_empty(&va_space->registered_gpus));
UVM_ASSERT(va_space->test.skip_migrate_vma ||
!vma_is_anonymous(vma) ||
uvm_processor_mask_empty(&va_space->registered_gpus));
// We can't use migrate_vma to move the pages as desired. Normally
// this fallback path is supposed to populate the memory then inform

View File

@@ -51,7 +51,7 @@ typedef struct
#if defined(CONFIG_MIGRATE_VMA_HELPER)
#define UVM_MIGRATE_VMA_SUPPORTED 1
#else
#if defined(CONFIG_DEVICE_PRIVATE) && defined(NV_MIGRATE_VMA_SETUP_PRESENT)
#if NV_IS_EXPORT_SYMBOL_PRESENT_migrate_vma_setup
#define UVM_MIGRATE_VMA_SUPPORTED 1
#endif
#endif
@@ -218,6 +218,9 @@ NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args);
NV_STATUS uvm_migrate_pageable_init(void);
void uvm_migrate_pageable_exit(void);
NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp);
#else // UVM_MIGRATE_VMA_SUPPORTED
static NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
@@ -251,6 +254,10 @@ static void uvm_migrate_pageable_exit(void)
{
}
static inline NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp)
{
return NV_OK;
}
#endif // UVM_MIGRATE_VMA_SUPPORTED
#endif

View File

@@ -323,37 +323,156 @@ static void uvm_mmu_page_table_cpu_memset_16(uvm_gpu_t *gpu,
uvm_mmu_page_table_cpu_unmap(gpu, phys_alloc);
}
static void pde_fill_cpu(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr)
{
NvU64 pde_data[2], entry_size;
NvU32 i;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(directory->depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
for (i = 0; i < pde_count; i++) {
tree->hal->make_pde(pde_data, phys_addr, directory, start_index + i);
if (entry_size == sizeof(pde_data[0]))
uvm_mmu_page_table_cpu_memset_8(tree->gpu, &directory->phys_alloc, start_index + i, pde_data[0], 1);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, &directory->phys_alloc, start_index + i, pde_data, 1);
}
}
static void pde_fill_gpu(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->phys_alloc.addr);
NvU32 max_inline_entries;
uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
uvm_gpu_address_t inline_data_addr;
uvm_push_inline_data_t inline_data;
NvU32 entry_count, i, j;
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(directory->depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / entry_size;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
pde_entry_addr.address += start_index * entry_size;
for (i = 0; i < pde_count;) {
// All but the first memory operation can be pipelined. We respect the
// caller's pipelining settings for the first push.
if (i != 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
entry_count = min(pde_count - i, max_inline_entries);
// No membar is needed until the last memory operation. Otherwise,
// use caller's membar flag.
if ((i + entry_count) < pde_count)
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
uvm_push_set_flag(push, push_membar_flag);
uvm_push_inline_data_begin(push, &inline_data);
for (j = 0; j < entry_count; j++) {
tree->hal->make_pde(pde_data, phys_addr, directory, start_index + i + j);
uvm_push_inline_data_add(&inline_data, pde_data, entry_size);
}
inline_data_addr = uvm_push_inline_data_end(&inline_data);
tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * entry_size);
i += entry_count;
pde_entry_addr.address += entry_size * entry_count;
}
}
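/*
 * Worked example for the chunking above (sizes are illustrative assumptions):
 * with a 4KB inline-data limit and 16-byte dual-PDE entries,
 * max_inline_entries == 4096 / 16 == 256, so writing 600 PDEs issues three
 * inline-data memcopies of 256, 256 and 88 entries; only the last one carries
 * the caller's membar flag, the earlier ones use MEMBAR_NONE.
 */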
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
// pde_fill() is optimized for pde_count == 1, which is the common case.
static void pde_fill(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, directory->depth, UVM_PAGE_SIZE_AGNOSTIC));
if (push)
pde_fill_gpu(tree, directory, start_index, pde_count, phys_addr, push);
else
pde_fill_cpu(tree, directory, start_index, pde_count, phys_addr);
}
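/*
 * Usage sketch (hypothetical call site): on a tree using CPU writes, a single
 * PDE entry can be rewritten by passing push == NULL, which routes through
 * pde_fill_cpu(); pde_write() below issues the same single-entry call.
 *
 *   uvm_mmu_page_table_alloc_t *allocs[2] = {NULL, NULL};
 *   pde_fill(tree, dir, entry_index, 1, allocs, NULL);
 */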
static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
{
NvU64 clear_bits[2];
uvm_mmu_mode_hal_t *hal = tree->hal;
NvU32 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);
NvU8 max_pde_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC) - 1;
if (dir->depth == tree->hal->page_table_depth(page_size)) {
*clear_bits = 0; // Invalid PTE
}
else {
// passing in NULL for the phys_allocs will mark the child entries as invalid
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
hal->make_pde(clear_bits, phys_allocs, dir->depth);
// Passing in NULL for the phys_allocs will mark the child entries as
// invalid.
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
// Make sure that using only clear_bits[0] will work
UVM_ASSERT(hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
}
// Init with an invalid PTE or clean PDE. Only Maxwell PDEs can have more
// than 512 entries. In this case, we initialize them all with the same
// clean PDE. ATS systems may require clean PDEs with
// ATS_ALLOWED/ATS_NOT_ALLOWED bit settings based on the mapping VA.
// We only set clear_bits to 0 at the lowest page table level (the PTE
// table), i.e., when depth is greater than max_pde_depth.
if ((dir->depth > max_pde_depth) || (entries_count > 512 && !g_uvm_global.ats.enabled)) {
NvU64 clear_bits[2];
// initialize the memory to a reasonable value
if (push) {
tree->gpu->parent->ce_hal->memset_8(push,
uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
// If it is not a PTE, make a clean PDE.
if (dir->depth != tree->hal->page_table_depth(page_size)) {
// The child index passed to make_pde() is zero/ignored here: it only
// matters on ATS-enabled systems, where directories are initialized
// through pde_fill() instead.
tree->hal->make_pde(clear_bits, phys_allocs, dir, 0);
// Make sure that using only clear_bits[0] will work.
UVM_ASSERT(tree->hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
}
else {
*clear_bits = 0;
}
// Initialize the memory to a reasonable value.
if (push) {
tree->gpu->parent->ce_hal->memset_8(push,
uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
*clear_bits,
dir->phys_alloc.size);
}
else {
uvm_mmu_page_table_cpu_memset_8(tree->gpu,
&dir->phys_alloc,
0,
*clear_bits,
dir->phys_alloc.size);
dir->phys_alloc.size / sizeof(*clear_bits));
}
}
else {
uvm_mmu_page_table_cpu_memset_8(tree->gpu,
&dir->phys_alloc,
0,
*clear_bits,
dir->phys_alloc.size / sizeof(*clear_bits));
pde_fill(tree, dir, 0, entries_count, phys_allocs, push);
}
}
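/*
 * Summary of the initialization paths chosen above, for reference:
 *
 *   PTE table (depth > max_pde_depth)    -> memset with the invalid PTE (0)
 *   >512-entry directory, ATS disabled   -> memset with one clean PDE
 *   any other directory                  -> pde_fill() with clean PDEs, which
 *                                           can apply a per-entry ATS PCF
 */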
static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
@@ -367,8 +486,10 @@ static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
NvLength phys_alloc_size = hal->allocation_size(depth, page_size);
uvm_page_directory_t *dir;
// The page tree doesn't cache PTEs so space is not allocated for entries that are always PTEs.
// 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not page_size.
// The page tree doesn't cache PTEs so space is not allocated for entries
// that are always PTEs.
// 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not
// page_size.
if (depth == hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC))
entry_count = 0;
else
@@ -409,108 +530,6 @@ static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, N
return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
}
static void pde_fill_cpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr)
{
NvU64 pde_data[2], entry_size;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
if (entry_size == sizeof(pde_data[0]))
uvm_mmu_page_table_cpu_memset_8(tree->gpu, directory, start_index, pde_data[0], pde_count);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, directory, start_index, pde_data, pde_count);
}
static void pde_fill_gpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->addr);
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
pde_entry_addr.address += start_index * entry_size;
if (entry_size == sizeof(pde_data[0])) {
tree->gpu->parent->ce_hal->memset_8(push, pde_entry_addr, pde_data[0], sizeof(pde_data[0]) * pde_count);
}
else {
NvU32 max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / sizeof(pde_data);
uvm_gpu_address_t inline_data_addr;
uvm_push_inline_data_t inline_data;
NvU32 membar_flag = 0;
NvU32 i;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
for (i = 0; i < pde_count;) {
NvU32 j;
NvU32 entry_count = min(pde_count - i, max_inline_entries);
uvm_push_inline_data_begin(push, &inline_data);
for (j = 0; j < entry_count; j++)
uvm_push_inline_data_add(&inline_data, pde_data, sizeof(pde_data));
inline_data_addr = uvm_push_inline_data_end(&inline_data);
// All but the first memcopy can be pipelined. We respect the
// caller's pipelining settings for the first push.
if (i != 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
// No membar is needed until the last copy. Otherwise, use
// caller's membar flag.
if (i + entry_count < pde_count)
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
else if (membar_flag)
uvm_push_set_flag(push, membar_flag);
tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * sizeof(pde_data));
i += entry_count;
pde_entry_addr.address += sizeof(pde_data) * entry_count;
}
}
}
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
static void pde_fill(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
if (push)
pde_fill_gpu(tree, depth, directory, start_index, pde_count, phys_addr, push);
else
pde_fill_cpu(tree, depth, directory, start_index, pde_count, phys_addr);
}
static uvm_page_directory_t *host_pde_write(uvm_page_directory_t *dir,
uvm_page_directory_t *parent,
NvU32 index_in_parent)
@@ -540,7 +559,7 @@ static void pde_write(uvm_page_tree_t *tree,
phys_allocs[i] = &entry->phys_alloc;
}
pde_fill(tree, dir->depth, &dir->phys_alloc, entry_index, 1, phys_allocs, push);
pde_fill(tree, dir, entry_index, 1, phys_allocs, push);
}
static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size)
@@ -800,7 +819,6 @@ static void free_unused_directories(uvm_page_tree_t *tree,
}
}
}
}
static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm_mmu_page_table_alloc_t *out)
@@ -811,10 +829,93 @@ static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm
return phys_mem_allocate(tree, alloc_size, tree->location, UVM_PMM_ALLOC_FLAGS_EVICT, out);
}
static bool page_tree_ats_init_required(uvm_page_tree_t *tree)
{
// We have full control of the kernel page table mappings, so no ATS address
// aliases are expected.
if (tree->type == UVM_PAGE_TREE_TYPE_KERNEL)
return false;
// Allow uvm_page_tree_init() to be called from the page_tree test.
if (uvm_enable_builtin_tests && tree->gpu_va_space == NULL)
return false;
if (!tree->gpu_va_space->ats.enabled)
return false;
return tree->gpu->parent->no_ats_range_required;
}
static NV_STATUS page_tree_ats_init(uvm_page_tree_t *tree)
{
NV_STATUS status;
NvU64 min_va_upper, max_va_lower;
NvU32 page_size;
if (!page_tree_ats_init_required(tree))
return NV_OK;
page_size = uvm_mmu_biggest_page_size(tree);
uvm_cpu_get_unaddressable_range(&max_va_lower, &min_va_upper);
// Potential violation of the UVM internal get/put_ptes contract. get_ptes()
// creates and initializes enough PTEs to populate all PDEs covering the
// no_ats_ranges. We store the no_ats_ranges in the tree, so they can be
// put_ptes()'ed on deinit(). This doesn't preclude the range from being used by a
// future get_ptes(), since we don't write to the PTEs (range->table) from
// the tree->no_ats_ranges.
//
// Lower half
status = uvm_page_tree_get_ptes(tree,
page_size,
max_va_lower,
page_size,
UVM_PMM_ALLOC_FLAGS_EVICT,
&tree->no_ats_ranges[0]);
if (status != NV_OK)
return status;
UVM_ASSERT(tree->no_ats_ranges[0].entry_count == 1);
if (uvm_platform_uses_canonical_form_address()) {
// Upper half
status = uvm_page_tree_get_ptes(tree,
page_size,
min_va_upper - page_size,
page_size,
UVM_PMM_ALLOC_FLAGS_EVICT,
&tree->no_ats_ranges[1]);
if (status != NV_OK)
return status;
UVM_ASSERT(tree->no_ats_ranges[1].entry_count == 1);
}
return NV_OK;
}
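/*
 * Worked example (illustrative, assuming a 48-bit canonical CPU VA): the CPU
 * hole reported by uvm_cpu_get_unaddressable_range() is
 * [0x0000800000000000, 0xffff800000000000), so the code above pins one leaf
 * table at 0x0000800000000000 and, since such systems use the canonical form,
 * a second one covering the page just below 0xffff800000000000.
 */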
static void page_tree_ats_deinit(uvm_page_tree_t *tree)
{
size_t i;
if (page_tree_ats_init_required(tree)) {
for (i = 0; i < ARRAY_SIZE(tree->no_ats_ranges); i++) {
if (tree->no_ats_ranges[i].entry_count)
uvm_page_tree_put_ptes(tree, &tree->no_ats_ranges[i]);
}
memset(tree->no_ats_ranges, 0, sizeof(tree->no_ats_ranges));
}
}
static void map_remap_deinit(uvm_page_tree_t *tree)
{
if (tree->map_remap.pde0.size)
phys_mem_deallocate(tree, &tree->map_remap.pde0);
if (tree->map_remap.pde0) {
phys_mem_deallocate(tree, &tree->map_remap.pde0->phys_alloc);
uvm_kvfree(tree->map_remap.pde0);
tree->map_remap.pde0 = NULL;
}
if (tree->map_remap.ptes_invalid_4k.size)
phys_mem_deallocate(tree, &tree->map_remap.ptes_invalid_4k);
@@ -839,10 +940,16 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
// PDE1-depth(512M) PTE. We first map it to the pde0 directory, then we
// return the PTE for the get_ptes()'s caller.
if (tree->hal->page_sizes() & UVM_PAGE_SIZE_512M) {
status = allocate_page_table(tree, UVM_PAGE_SIZE_2M, &tree->map_remap.pde0);
if (status != NV_OK)
tree->map_remap.pde0 = allocate_directory(tree,
UVM_PAGE_SIZE_2M,
tree->hal->page_table_depth(UVM_PAGE_SIZE_2M),
UVM_PMM_ALLOC_FLAGS_EVICT);
if (tree->map_remap.pde0 == NULL) {
status = NV_ERR_NO_MEMORY;
goto error;
}
}
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "map remap init");
if (status != NV_OK)
goto error;
@@ -864,22 +971,23 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
NvU32 depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_4K) - 1;
size_t index_4k = tree->hal->entry_offset(depth, UVM_PAGE_SIZE_4K);
// pde0 depth equals UVM_PAGE_SIZE_2M.
NvU32 pde0_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_2M);
NvU32 pde0_entries = tree->map_remap.pde0.size / tree->hal->entry_size(pde0_depth);
NvU32 pde0_entries = tree->map_remap.pde0->phys_alloc.size / tree->hal->entry_size(tree->map_remap.pde0->depth);
// The big-page entry is NULL which makes it an invalid entry.
phys_allocs[index_4k] = &tree->map_remap.ptes_invalid_4k;
// By default CE operations include a MEMBAR_SYS. MEMBAR_GPU is
// sufficient when pde0 is allocated in VIDMEM.
if (tree->map_remap.pde0.addr.aperture == UVM_APERTURE_VID)
if (tree->map_remap.pde0->phys_alloc.addr.aperture == UVM_APERTURE_VID)
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
// This is an orphan directory; make_pde() requires a directory to
// compute the VA. The UVM depth map_remap() operates on is not in the
// range make_pde() must operate on. We only need to supply the fields
// used by make_pde() so it does not access invalid memory addresses.
pde_fill(tree,
pde0_depth,
&tree->map_remap.pde0,
tree->map_remap.pde0,
0,
pde0_entries,
(uvm_mmu_page_table_alloc_t **)&phys_allocs,
@@ -1006,11 +1114,22 @@ NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
return status;
phys_mem_init(tree, UVM_PAGE_SIZE_AGNOSTIC, tree->root, &push);
return page_tree_end_and_wait(tree, &push);
status = page_tree_end_and_wait(tree, &push);
if (status != NV_OK)
return status;
status = page_tree_ats_init(tree);
if (status != NV_OK)
return status;
return NV_OK;
}
void uvm_page_tree_deinit(uvm_page_tree_t *tree)
{
page_tree_ats_deinit(tree);
UVM_ASSERT(tree->root->ref_count == 0);
// Take the tree lock only to avoid assertions. It is not required for
@@ -1249,7 +1368,6 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
UVM_ASSERT(uvm_gpu_can_address_kernel(tree->gpu, start, size));
while (true) {
// index of the entry, for the first byte of the range, within its
// containing directory
NvU32 start_index;
@@ -1281,7 +1399,8 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
if (dir_cache[dir->depth] == NULL) {
*cur_depth = dir->depth;
// Undo the changes to the tree so that the dir cache remains private to the thread
// Undo the changes to the tree so that the dir cache
// remains private to the thread.
for (i = 0; i < used_count; i++)
host_pde_clear(tree, dirs_used[i]->host_parent, dirs_used[i]->index_in_parent, page_size);
@@ -1332,10 +1451,9 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
if (uvm_page_table_range_aperture(range) == UVM_APERTURE_VID)
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
phys_alloc[0] = &tree->map_remap.pde0;
phys_alloc[0] = &tree->map_remap.pde0->phys_alloc;
pde_fill(tree,
range->table->depth,
&range->table->phys_alloc,
range->table,
range->start_index,
range->entry_count,
(uvm_mmu_page_table_alloc_t **)&phys_alloc,
@@ -1380,7 +1498,8 @@ NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
dir_cache)) == NV_ERR_MORE_PROCESSING_REQUIRED) {
uvm_mutex_unlock(&tree->lock);
// try_get_ptes never needs depth 0, so store a directory at its parent's depth
// try_get_ptes never needs depth 0, so store a directory at its
// parent's depth.
// TODO: Bug 1766655: Allocate everything below cur_depth instead of
// retrying for every level.
dir_cache[cur_depth] = allocate_directory(tree, page_size, cur_depth + 1, pmm_flags);
@@ -1663,8 +1782,12 @@ NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
range);
if (status != NV_OK) {
UVM_ERR_PRINT("Failed to get PTEs for subrange %zd [0x%llx, 0x%llx) size 0x%llx, part of [0x%llx, 0x%llx)\n",
i, range_start, range_start + range_size, range_size,
start, size);
i,
range_start,
range_start + range_size,
range_size,
start,
size);
goto out;
}
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -215,11 +215,14 @@ struct uvm_mmu_mode_hal_struct
// memory out-of-range error so we can immediately identify bad PTE usage.
NvU64 (*poisoned_pte)(void);
// write a PDE bit-pattern to entry based on the data in entries (which may
// Write a PDE bit-pattern to entry based on the data in allocs (which may
// point to two items for dual PDEs).
// any of allocs are allowed to be NULL, in which case they are to be
// treated as empty.
void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, NvU32 depth);
// Any of allocs are allowed to be NULL, in which case they are to be
// treated as empty. make_pde() uses dir and child_index to compute the
// mapping PDE VA. On ATS-enabled systems, we may set PDE's PCF as
// ATS_ALLOWED or ATS_NOT_ALLOWED based on the mapping PDE VA, even for
// invalid/clean PDE entries.
void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, uvm_page_directory_t *dir, NvU32 child_index);
// size of an entry in a directory/table. Generally either 8 or 16 bytes.
// (in the case of Pascal dual PDEs)
@@ -229,7 +232,7 @@ struct uvm_mmu_mode_hal_struct
NvU32 (*entries_per_index)(NvU32 depth);
// For dual PDEs, this is either 1 or 0, depending on the page size.
// This is used to index the host copy only. GPU PDEs are always entirely
// This is used to index the host copy only. GPU PDEs are always entirely
// re-written using make_pde.
NvLength (*entry_offset)(NvU32 depth, NvU32 page_size);
@@ -295,11 +298,16 @@ struct uvm_page_tree_struct
// PDE0 where all big-page entries are invalid, and small-page entries
// point to ptes_invalid_4k.
// pde0 is only used on Pascal-Ampere, i.e., they have the same PDE
// format.
uvm_mmu_page_table_alloc_t pde0;
// pde0 is used on Pascal+ GPUs, i.e., they have the same PDE format.
uvm_page_directory_t *pde0;
} map_remap;
// On ATS-enabled systems where the CPU VA width is smaller than the GPU VA
// width, the excess address range is set with ATS_NOT_ALLOWED on all leaf
// PDEs covering that range. We have at most 2 no_ats_ranges, due to
// canonical form address systems.
uvm_page_table_range_t no_ats_ranges[2];
// Tracker for all GPU operations on the tree
uvm_tracker_t tracker;
};
@@ -365,21 +373,32 @@ void uvm_page_tree_deinit(uvm_page_tree_t *tree);
// the same page size without an intervening put_ptes. To duplicate a subset of
// an existing range or change the size of an existing range, use
// uvm_page_table_range_get_upper() and/or uvm_page_table_range_shrink().
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range);
// Same as uvm_page_tree_get_ptes(), but doesn't synchronize the GPU work.
//
// All pending operations can be waited on with uvm_page_tree_wait().
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range);
// Returns a single-entry page table range for the addresses passed.
// The size parameter must be a page size supported by this tree.
// This is equivalent to calling uvm_page_tree_get_ptes() with size equal to
// page_size.
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *single);
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single);
// For a single-entry page table range, write the PDE (which could be a dual
// PDE) to the GPU.
@@ -478,8 +497,8 @@ NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
// new_range_vec will contain the upper portion of range_vec, starting at
// new_end + 1.
//
// new_end + 1 is required to be within the address range of range_vec and be aligned to
// range_vec's page_size.
// new_end + 1 is required to be within the address range of range_vec and be
// aligned to range_vec's page_size.
//
// On failure, the original range vector is left unmodified.
NV_STATUS uvm_page_table_range_vec_split_upper(uvm_page_table_range_vec_t *range_vec,
@@ -501,18 +520,22 @@ void uvm_page_table_range_vec_destroy(uvm_page_table_range_vec_t *range_vec);
// for each offset.
// The caller_data pointer is what the caller passed in as caller_data to
// uvm_page_table_range_vec_write_ptes().
typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec, NvU64 offset,
void *caller_data);
typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec,
NvU64 offset,
void *caller_data);
// Write all PTEs covered by the range vector using the given PTE making function.
// Write all PTEs covered by the range vector using the given PTE making
// function.
//
// After writing all the PTEs a TLB invalidate operation is performed including
// the passed in tlb_membar.
//
// See comments about uvm_page_table_range_pte_maker_t for details about the
// PTE making callback.
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker, void *caller_data);
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker,
void *caller_data);
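/*
 * Usage sketch (hypothetical callback, mirroring the page-tree test): each
 * offset gets a distinctive dummy value rather than a real MMU PTE encoding.
 *
 *   static NvU64 example_pte_maker(uvm_page_table_range_vec_t *range_vec,
 *                                  NvU64 offset,
 *                                  void *caller_data)
 *   {
 *       return range_vec->size + offset;
 *   }
 *
 *   status = uvm_page_table_range_vec_write_ptes(range_vec,
 *                                                UVM_MEMBAR_NONE,
 *                                                example_pte_maker,
 *                                                NULL);
 */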
// Set all PTEs covered by the range vector to an empty PTE
//
@@ -636,8 +659,9 @@ static NvU64 uvm_page_table_range_size(uvm_page_table_range_t *range)
// Get the physical address of the entry at entry_index within the range
// (counted from range->start_index).
static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree, uvm_page_table_range_t *range,
size_t entry_index)
static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree,
uvm_page_table_range_t *range,
size_t entry_index)
{
NvU32 entry_size = uvm_mmu_pte_size(tree, range->page_size);
uvm_gpu_phys_address_t entry = range->table->phys_alloc.addr;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -146,9 +146,15 @@ static void fake_tlb_invals_disable(void)
g_fake_tlb_invals_tracking_enabled = false;
}
// Fake TLB invalidate VA that just saves off the parameters so that they can be verified later
static void fake_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_address_t pdb,
NvU32 depth, NvU64 base, NvU64 size, NvU32 page_size, uvm_membar_t membar)
// Fake TLB invalidate VA that just saves off the parameters so that they can be
// verified later.
static void fake_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
uvm_membar_t membar)
{
if (!g_fake_tlb_invals_tracking_enabled)
return;
@@ -210,8 +216,8 @@ static bool assert_and_reset_last_invalidate(NvU32 expected_depth, bool expected
}
if ((g_last_fake_inval->membar == UVM_MEMBAR_NONE) == expected_membar) {
UVM_TEST_PRINT("Expected %s membar, got %s instead\n",
expected_membar ? "a" : "no",
uvm_membar_string(g_last_fake_inval->membar));
expected_membar ? "a" : "no",
uvm_membar_string(g_last_fake_inval->membar));
result = false;
}
@@ -230,7 +236,8 @@ static bool assert_last_invalidate_all(NvU32 expected_depth, bool expected_memba
}
if (g_last_fake_inval->base != 0 || g_last_fake_inval->size != -1) {
UVM_TEST_PRINT("Expected invalidate all but got range [0x%llx, 0x%llx) instead\n",
g_last_fake_inval->base, g_last_fake_inval->base + g_last_fake_inval->size);
g_last_fake_inval->base,
g_last_fake_inval->base + g_last_fake_inval->size);
return false;
}
if (g_last_fake_inval->depth != expected_depth) {
@@ -247,15 +254,16 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);
if (g_fake_invals_count == 0) {
UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n",
base, base + size);
UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n", base, base + size);
return false;
}
if ((inval->base != base || inval->size != size) && inval->base != 0 && inval->size != -1) {
UVM_TEST_PRINT("Expected invalidate range [0x%llx, 0x%llx), but got range [0x%llx, 0x%llx) instead\n",
base, base + size,
inval->base, inval->base + inval->size);
base,
base + size,
inval->base,
inval->base + inval->size);
return false;
}
if (inval->depth != expected_depth) {
@@ -270,7 +278,13 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
return true;
}
static bool assert_invalidate_range(NvU64 base, NvU64 size, NvU32 page_size, bool allow_inval_all, NvU32 range_depth, NvU32 all_depth, bool expected_membar)
static bool assert_invalidate_range(NvU64 base,
NvU64 size,
NvU32 page_size,
bool allow_inval_all,
NvU32 range_depth,
NvU32 all_depth,
bool expected_membar)
{
NvU32 i;
@@ -488,7 +502,6 @@ static NV_STATUS alloc_adjacent_pde_64k_memory(uvm_gpu_t *gpu)
return NV_OK;
}
static NV_STATUS alloc_nearby_pde_64k_memory(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
@@ -842,6 +855,7 @@ static NV_STATUS get_two_free_apart(uvm_gpu_t *gpu)
TEST_CHECK_RET(range2.entry_count == 256);
TEST_CHECK_RET(range2.table->ref_count == 512);
TEST_CHECK_RET(range1.table == range2.table);
// 4k page is second entry in a dual PDE
TEST_CHECK_RET(range1.table == tree.root->entries[0]->entries[0]->entries[0]->entries[1]);
TEST_CHECK_RET(range1.start_index == 256);
@@ -871,6 +885,7 @@ static NV_STATUS get_overlapping_dual_pdes(uvm_gpu_t *gpu)
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, size, size, &range64k), NV_OK);
TEST_CHECK_RET(range64k.entry_count == 16);
TEST_CHECK_RET(range64k.table->ref_count == 16);
// 4k page is second entry in a dual PDE
TEST_CHECK_RET(range64k.table == tree.root->entries[0]->entries[0]->entries[0]->entries[0]);
TEST_CHECK_RET(range64k.start_index == 16);
@@ -1030,10 +1045,13 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
// Depth 4
NvU64 extent_pte = UVM_PAGE_SIZE_2M;
// Depth 3
NvU64 extent_pde0 = extent_pte * (1ull << 8);
// Depth 2
NvU64 extent_pde1 = extent_pde0 * (1ull << 9);
// Depth 1
NvU64 extent_pde2 = extent_pde1 * (1ull << 9);
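/*
 * Worked values for the extents above: extent_pte = 2MB,
 * extent_pde0 = 2MB * 256 = 512MB, extent_pde1 = 512MB * 512 = 256GB,
 * extent_pde2 = 256GB * 512 = 128TB.
 */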
@@ -1081,7 +1099,11 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
return status;
}
static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree, NvU64 base, NvU64 size, NvU32 min_page_size, NvU32 max_page_size)
static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
NvU64 base,
NvU64 size,
NvU32 min_page_size,
NvU32 max_page_size)
{
NV_STATUS status = NV_OK;
uvm_push_t push;
@@ -1205,7 +1227,11 @@ static bool assert_range_vec_ptes(uvm_page_table_range_vec_t *range_vec, bool ex
NvU64 expected_pte = expecting_cleared ? 0 : range_vec->size + offset;
if (*pte != expected_pte) {
UVM_TEST_PRINT("PTE is 0x%llx instead of 0x%llx for offset 0x%llx within range [0x%llx, 0x%llx)\n",
*pte, expected_pte, offset, range_vec->start, range_vec->size);
*pte,
expected_pte,
offset,
range_vec->start,
range_vec->size);
return false;
}
offset += range_vec->page_size;
@@ -1226,7 +1252,11 @@ static NV_STATUS test_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec
TEST_CHECK_RET(data.status == NV_OK);
TEST_CHECK_RET(data.count == range_vec->size / range_vec->page_size);
TEST_CHECK_RET(assert_invalidate_range_specific(g_last_fake_inval,
range_vec->start, range_vec->size, range_vec->page_size, page_table_depth, membar != UVM_MEMBAR_NONE));
range_vec->start,
range_vec->size,
range_vec->page_size,
page_table_depth,
membar != UVM_MEMBAR_NONE));
TEST_CHECK_RET(assert_range_vec_ptes(range_vec, false));
fake_tlb_invals_disable();
@@ -1249,7 +1279,11 @@ static NV_STATUS test_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec
return NV_OK;
}
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree, NvU64 start, NvU64 size, NvU32 page_size, uvm_page_table_range_vec_t **range_vec_out)
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
uvm_page_table_range_vec_t **range_vec_out)
{
uvm_page_table_range_vec_t *range_vec;
uvm_pmm_alloc_flags_t pmm_flags = UVM_PMM_ALLOC_FLAGS_EVICT;
@@ -1544,25 +1578,28 @@ static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
uvm_mmu_mode_hal_t *hal;
uvm_page_directory_t dir;
NvU32 i, j, big_page_size, page_size;
dir.depth = 0;
for (i = 0; i < ARRAY_SIZE(big_page_sizes); i++) {
big_page_size = big_page_sizes[i];
hal = gpu->parent->arch_hal->mmu_mode_hal(big_page_size);
memset(phys_allocs, 0, sizeof(phys_allocs));
hal->make_pde(&pde_bits, phys_allocs, 0);
hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits == 0x0L);
phys_allocs[0] = &alloc_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(&pde_bits, phys_allocs, 0);
hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits == 0x1BBBBBBD99999992LL);
phys_allocs[0] = &alloc_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(&pde_bits, phys_allocs, 0);
hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits == 0x9999999E1BBBBBB1LL);
for (j = 0; j <= 2; j++) {
@@ -1632,38 +1669,47 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
uvm_page_directory_t dir;
// big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBBB00LL);
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
dir.index_in_parent = 0;
dir.host_parent = NULL;
dir.depth = 0;
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 3);
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0);
dir.depth = 0;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 3);
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 3);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
// uncached, i.e., the sysmem data is not cached in GPU's L2 cache. Clear
@@ -1719,6 +1765,7 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
uvm_page_directory_t dir;
// big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
@@ -1726,37 +1773,45 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
dir.index_in_parent = 0;
dir.host_parent = NULL;
dir.depth = 0;
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 3);
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0);
dir.depth = 0;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 3);
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 3);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
// NO_ATS PDE1 (depth 2)
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 2);
dir.depth = 2;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
if (g_uvm_global.ats.enabled)
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB2A);
else
@@ -1791,104 +1846,203 @@ static NV_STATUS entry_test_ampere(uvm_gpu_t *gpu, entry_test_page_size_func ent
static NV_STATUS entry_test_hopper(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NV_STATUS status = NV_OK;
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
uvm_page_directory_t *dirs[5];
size_t i, num_page_sizes;
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0xBBBBBBB000LL);
// big versions have [11:8] set as well to test the page table merging
// Big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999999900LL);
uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0xBBBBBBBB00LL);
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(dirs, 0, sizeof(dirs));
// Fake directory tree.
for (i = 0; i < ARRAY_SIZE(dirs); i++) {
dirs[i] = uvm_kvmalloc_zero(sizeof(uvm_page_directory_t) + sizeof(dirs[i]->entries[0]) * 512);
TEST_CHECK_GOTO(dirs[i] != NULL, cleanup);
dirs[i]->depth = i;
dirs[i]->index_in_parent = 0;
if (i == 0)
dirs[i]->host_parent = NULL;
else
dirs[i]->host_parent = dirs[i - 1];
}
// Make sure cleared PDEs work as expected.
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0, cleanup);
// Cleared PDEs work as expected for big and small PDEs.
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0 && pde_bits[1] == 0, cleanup);
// Sys and vidmem PDEs, uncached ATS allowed.
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x999999999900C);
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBB00A);
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBB00A, cleanup);
// Dual PDEs, uncached.
// Dual PDEs, uncached. We don't vary the child index in the depth 4 checks
// because our policy decides the PDE's PCF without using it.
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A);
hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
if (g_uvm_global.ats.enabled)
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A, cleanup);
else
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999990C && pde_bits[1] == 0xBBBBBBB00A, cleanup);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C);
hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
if (g_uvm_global.ats.enabled)
TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C, cleanup);
else
TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBBB0A && pde_bits[1] == 0x999999999900C, cleanup);
// We only need to test make_pde() on ATS when the CPU VA width < GPU's.
if (g_uvm_global.ats.enabled && uvm_cpu_num_va_bits() < hal->num_va_bits()) {
phys_allocs[0] = &alloc_sys;
dirs[1]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
dirs[2]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 1;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 1);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 2;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 2);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 511;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 511);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[1]->index_in_parent = 1;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 1);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
dirs[2]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 509;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 509);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 510;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 510);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
phys_allocs[0] = NULL;
dirs[1]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
dirs[2]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
dirs[2]->index_in_parent = 2;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 2);
TEST_CHECK_GOTO(pde_bits[0] == 0x10, cleanup);
dirs[1]->index_in_parent = 1;
dirs[2]->index_in_parent = 509;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 509);
TEST_CHECK_GOTO(pde_bits[0] == 0x10, cleanup);
dirs[2]->index_in_parent = 510;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 510);
TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
}
// uncached, i.e., the sysmem data is not cached in GPU's L2 cache, and
// access counters disabled.
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) == 0x999999999968D);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) == 0x999999999968D,
cleanup);
// change to cached.
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED | UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) ==
0x9999999999685);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED | UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) ==
0x9999999999685,
cleanup);
// enable access counters.
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999605);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999605,
cleanup);
// remove atomic
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999645);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999645,
cleanup);
// read only
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999665);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999665,
cleanup);
// local video
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_VID,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0xBBBBBBB661);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_VID,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0xBBBBBBB661,
cleanup);
// peer 1
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_PEER_1,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x200000BBBBBBB663);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_PEER_1,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x200000BBBBBBB663,
cleanup);
// sparse
TEST_CHECK_RET(hal->make_sparse_pte() == 0x8);
TEST_CHECK_GOTO(hal->make_sparse_pte() == 0x8, cleanup);
// sked reflected
TEST_CHECK_RET(hal->make_sked_reflected_pte() == 0xF09);
TEST_CHECK_GOTO(hal->make_sked_reflected_pte() == 0xF09, cleanup);
num_page_sizes = get_page_sizes(gpu, page_sizes);
for (i = 0; i < num_page_sizes; i++)
TEST_NV_CHECK_RET(entry_test_page_size(gpu, page_sizes[i]));
TEST_NV_CHECK_GOTO(entry_test_page_size(gpu, page_sizes[i]), cleanup);
return NV_OK;
cleanup:
for (i = 0; i < ARRAY_SIZE(dirs); i++)
uvm_kvfree(dirs[i]);
return status;
}
static NV_STATUS alloc_4k_maxwell(uvm_gpu_t *gpu)
@@ -2303,7 +2457,8 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
gpu->parent = parent_gpu;
// At least test_tlb_invalidates() relies on global state
// (g_tlb_invalidate_*) so make sure only one test instance can run at a time.
// (g_tlb_invalidate_*) so make sure only one test instance can run at a
// time.
uvm_mutex_lock(&g_uvm_global.global_lock);
// Allocate the fake TLB tracking state. Notably tests still need to enable
@@ -2311,7 +2466,13 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
// calls.
TEST_NV_CHECK_GOTO(fake_tlb_invals_alloc(), done);
TEST_NV_CHECK_GOTO(maxwell_test_page_tree(gpu), done);
// We prevent the maxwell_test_page_tree test from running on ATS-enabled
// systems. On "fake" Maxwell-based ATS systems pde_fill() may push more
// methods than what we support in UVM, specifically during
// uvm_page_tree_init(), which eventually calls phys_mem_init(). On Maxwell,
// upper PDE levels have more than 512 entries.
if (!g_uvm_global.ats.enabled)
TEST_NV_CHECK_GOTO(maxwell_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(pascal_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(volta_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(ampere_test_page_tree(gpu), done);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2020 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -100,4 +100,6 @@ void uvm_hal_pascal_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->smc.supported = false;
parent_gpu->plc_supported = false;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2020 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -140,11 +140,18 @@ static NvU64 small_half_pde_pascal(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_pascal(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
static void make_pde_pascal(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
uvm_page_directory_t *dir,
NvU32 child_index)
{
NvU32 entry_count = entries_per_index_pascal(depth);
NvU32 entry_count;
NvU64 *entry_bits = (NvU64 *)entry;
UVM_ASSERT(dir);
entry_count = entries_per_index_pascal(dir->depth);
if (entry_count == 1) {
*entry_bits = single_pde_pascal(*phys_allocs);
}
@@ -152,7 +159,8 @@ static void make_pde_pascal(void *entry, uvm_mmu_page_table_alloc_t **phys_alloc
entry_bits[MMU_BIG] = big_half_pde_pascal(phys_allocs[MMU_BIG]);
entry_bits[MMU_SMALL] = small_half_pde_pascal(phys_allocs[MMU_SMALL]);
// This entry applies to the whole dual PDE but is stored in the lower bits
// This entry applies to the whole dual PDE but is stored in the lower
// bits.
entry_bits[MMU_BIG] |= HWCONST64(_MMU_VER2, DUAL_PDE, IS_PDE, TRUE);
}
else {

View File

@@ -36,6 +36,7 @@
#include "uvm_mmu.h"
#include "uvm_gpu_access_counters.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_migrate_pageable.h"
static NV_STATUS uvm_test_get_gpu_ref_count(UVM_TEST_GET_GPU_REF_COUNT_PARAMS *params, struct file *filp)
{
@@ -331,6 +332,7 @@ long uvm_test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
UVM_ROUTE_CMD_STACK_NO_INIT_CHECK(UVM_TEST_CGROUP_ACCOUNTING_SUPPORTED, uvm_test_cgroup_accounting_supported);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SPLIT_INVALIDATE_DELAY, uvm_test_split_invalidate_delay);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_CPU_CHUNK_API, uvm_test_cpu_chunk_api);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SKIP_MIGRATE_VMA, uvm_test_skip_migrate_vma);
}
return -EINVAL;

View File

@@ -28,6 +28,13 @@
#include "uvm_ioctl.h"
#include "nv_uvm_types.h"
#define UVM_TEST_SKIP_MIGRATE_VMA UVM_TEST_IOCTL_BASE(103)
typedef struct
{
NvBool skip; // In
NV_STATUS rmStatus; // Out
} UVM_TEST_SKIP_MIGRATE_VMA_PARAMS;
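For context, the new override is routed through uvm_test_ioctl() in the hunk above. A rough user-space sketch, assuming the UVM test ioctls are issued as plain ioctl(2) calls on an open UVM device file descriptor (uvm_fd and the error handling here are purely illustrative):
UVM_TEST_SKIP_MIGRATE_VMA_PARAMS params = { 0 };
params.skip = NV_TRUE;   /* ask UVM to bypass the migrate_vma path */
if (ioctl(uvm_fd, UVM_TEST_SKIP_MIGRATE_VMA, &params) != 0 || params.rmStatus != NV_OK)
{
    /* the test override could not be applied */
}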
#ifdef __cplusplus
extern "C" {
#endif

View File

@@ -1082,25 +1082,19 @@ void uvm_tools_broadcast_replay(uvm_gpu_t *gpu,
}
void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu,
NvU32 batch_id,
uvm_fault_client_type_t client_type)
void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu, NvU32 batch_id, uvm_fault_client_type_t client_type)
{
UVM_ASSERT(!gpu->parent->has_clear_faulted_channel_method);
if (!tools_is_event_enabled_in_any_va_space(UvmEventTypeGpuFaultReplay))
return;
record_replay_event_helper(gpu->id,
batch_id,
client_type,
NV_GETTIME(),
gpu->parent->host_hal->get_time(gpu));
record_replay_event_helper(gpu->id, batch_id, client_type, NV_GETTIME(), gpu->parent->host_hal->get_time(gpu));
}
void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *buffer_entry,
bool on_managed)
bool on_managed_phys)
{
UvmEventEntry entry;
UvmEventTestAccessCounterInfo *info = &entry.testEventData.accessCounter;
@@ -1119,6 +1113,7 @@ void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
info->srcIndex = uvm_id_value(gpu->id);
info->address = buffer_entry->address.address;
info->isVirtual = buffer_entry->address.is_virtual? 1: 0;
if (buffer_entry->address.is_virtual) {
info->instancePtr = buffer_entry->virtual_info.instance_ptr.address;
info->instancePtrAperture = g_hal_to_tools_aperture_table[buffer_entry->virtual_info.instance_ptr.aperture];
@@ -1126,9 +1121,10 @@ void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
}
else {
info->aperture = g_hal_to_tools_aperture_table[buffer_entry->address.aperture];
info->physOnManaged = on_managed_phys? 1 : 0;
}
info->isFromCpu = buffer_entry->counter_type == UVM_ACCESS_COUNTER_TYPE_MOMC? 1: 0;
info->onManaged = on_managed? 1 : 0;
info->value = buffer_entry->counter_value;
info->subGranularity = buffer_entry->sub_granularity;
info->bank = buffer_entry->bank;

View File

@@ -102,18 +102,13 @@ void uvm_tools_record_read_duplicate_invalidate(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
const uvm_page_mask_t *page_mask);
void uvm_tools_broadcast_replay(uvm_gpu_t *gpu,
uvm_push_t *push,
NvU32 batch_id,
uvm_fault_client_type_t client_type);
void uvm_tools_broadcast_replay(uvm_gpu_t *gpu, uvm_push_t *push, NvU32 batch_id, uvm_fault_client_type_t client_type);
void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu,
NvU32 batch_id,
uvm_fault_client_type_t client_type);
void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu, NvU32 batch_id, uvm_fault_client_type_t client_type);
void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *buffer_entry,
bool on_managed);
bool on_managed_phys);
void uvm_tools_test_hmm_split_invalidate(uvm_va_space_t *va_space);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2021 NVIDIA Corporation
Copyright (c) 2017-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -93,4 +93,6 @@ void uvm_hal_turing_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->smc.supported = false;
parent_gpu->plc_supported = true;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -967,8 +967,10 @@ typedef struct
NvU8 isFromCpu;
NvU8 veId;
NvU8 onManaged; // The access counter notification was triggered on
// a managed memory region
// The physical access counter notification was triggered on a managed
// memory region. This is not set for virtual access counter notifications.
NvU8 physOnManaged;
NvU32 value;
NvU32 subGranularity;

View File

@@ -1760,6 +1760,21 @@ static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
return (NvU32)chunk_size;
}
NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
uvm_processor_id_t processor,
uvm_page_index_t page_index)
{
block_phys_page_t page;
UVM_ASSERT(block);
uvm_assert_mutex_locked(&block->lock);
page = block_phys_page(processor, page_index);
return block_phys_page_size(block, page);
}
static uvm_pte_bits_cpu_t get_cpu_pte_bit_index(uvm_prot_t prot)
{
uvm_pte_bits_cpu_t pte_bit_index = UVM_PTE_BITS_CPU_MAX;
@@ -8248,14 +8263,6 @@ void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
event_data.block_munmap.region = region;
uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_MUNMAP, &event_data);
// Set a flag so that GPU fault events are flushed since they might refer
// to the region being unmapped.
// Note that holding the va_block lock prevents GPU VA spaces from
// being removed so the registered_gpu_va_spaces mask is stable.
for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
}
// Release any remaining vidmem chunks in the given region.
for_each_gpu_id(gpu_id) {
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
@@ -10155,6 +10162,34 @@ static uvm_processor_id_t block_select_residency(uvm_va_block_t *va_block,
uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id))
return preferred_location;
// Check if we should map the closest resident processor remotely on a remote
// CPU fault.
//
// When the CPU faults, there is a Linux process on whose behalf the fault is
// handled, associated with a unique VM pointed to by current->mm. A block of
// memory resident on the GPU is also associated with a VM, pointed to by
// va_block_context->mm. If the two match, this is a regular (local) fault and
// we may want to migrate the page from the GPU to the CPU. If it is a
// 'remote' fault, i.e. the faulting Linux process differs from the one
// associated with the block's VM, we may preserve the residency instead.
//
// Establishing a remote mapping without access counters means the memory
// could stay in the wrong place for a long time, which is why we prefer to
// avoid creating remote mappings. However, when a NIC accesses memory
// resident on the GPU, it is worth keeping it in place for those NIC
// accesses.
//
// The logic used to detect remote faulting also keeps memory in place for
// ptrace accesses. We would prefer to control those two policies separately,
// but the NIC case takes priority.
//
// If the accessing processor is the CPU, we are either handling a fault from
// a process other than the owning one, or handling an MOMC notification.
// Only prevent migration for the former.
if (UVM_ID_IS_CPU(processor_id) &&
operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS &&
uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
va_block_context->mm != current->mm) {
UVM_ASSERT(va_block_context->mm != NULL);
return closest_resident_processor;
}
// If the page is resident on a processor other than the preferred location,
// or the faulting processor can't access the preferred location, we select
// the faulting processor as the new residency.
@@ -10713,7 +10748,7 @@ NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t processor_id,
uvm_page_index_t page_index,
uvm_fault_type_t access_type,
uvm_fault_access_type_t access_type,
bool allow_migration)
{
uvm_va_range_t *va_range = va_block->va_range;

View File

@@ -1000,7 +1000,7 @@ NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t processor_id,
uvm_page_index_t page_index,
uvm_fault_type_t access_type,
uvm_fault_access_type_t access_type,
bool allow_migration);
// API for access privilege revocation
@@ -2072,6 +2072,14 @@ void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
// Locking: The va_block lock must be held.
void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region);
// Get the size of the physical allocation backing the page at page_index on the
// specified processor in the block. Returns 0 if the address is not resident on
// the specified processor.
// Locking: The va_block lock must be held.
NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
uvm_processor_id_t processor,
uvm_page_index_t page_index);
// Get CPU page size or 0 if it is not mapped
NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
uvm_page_index_t page_index);

View File

@@ -193,7 +193,8 @@ uvm_va_policy_node_t *uvm_va_policy_node_iter_next(uvm_va_block_t *va_block, uvm
for ((node) = uvm_va_policy_node_iter_first((va_block), (start), (end)), \
(next) = uvm_va_policy_node_iter_next((va_block), (node), (end)); \
(node); \
(node) = (next))
(node) = (next), \
(next) = uvm_va_policy_node_iter_next((va_block), (node), (end)))
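The added increment expression re-fetches 'next' on every pass; the previous form computed it only once, so the iterator could not advance past the second node, and the body could not safely drop the node it was visiting. A minimal sketch of the same "fetch next before the body" idea on a toy list (everything below is invented for illustration, not a UVM API):
#include <stdio.h>
#include <stdlib.h>

struct node { int val; struct node *next; };

/* Re-fetch 'next' each iteration so the body may free 'node'. */
#define for_each_node_safe(node, next, head)                          \
    for ((node) = (head), (next) = (node) ? (node)->next : NULL;      \
         (node);                                                      \
         (node) = (next), (next) = (node) ? (node)->next : NULL)

int main(void)
{
    struct node *head = NULL, *n, *nn;
    int i;

    for (i = 0; i < 3; i++) {
        n = malloc(sizeof(*n));
        n->val = i;
        n->next = head;
        head = n;
    }

    /* Freeing the current node is safe: 'nn' was captured before the body. */
    for_each_node_safe(n, nn, head) {
        printf("%d\n", n->val);
        free(n);
    }
    return 0;
}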
// Returns the first policy in the range [start, end], if any.
// Locking: The va_block lock must be held.

View File

@@ -1540,7 +1540,6 @@ static void remove_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space,
atomic_inc(&va_space->gpu_va_space_deferred_free.num_pending);
uvm_processor_mask_clear(&va_space->registered_gpu_va_spaces, gpu_va_space->gpu->id);
uvm_processor_mask_clear_atomic(&va_space->needs_fault_buffer_flush, gpu_va_space->gpu->id);
va_space->gpu_va_spaces[uvm_id_gpu_index(gpu_va_space->gpu->id)] = NULL;
gpu_va_space->state = UVM_GPU_VA_SPACE_STATE_DEAD;
}

View File

@@ -253,17 +253,6 @@ struct uvm_va_space_struct
// corrupting state.
uvm_processor_mask_t gpu_unregister_in_progress;
// On VMA destruction, the fault buffer needs to be flushed for all the GPUs
// registered in the VA space to avoid leaving stale entries of the VA range
// that is going to be destroyed. Otherwise, these fault entries can be
// attributed to new VA ranges reallocated at the same addresses. However,
// uvm_vm_close is called with mm->mmap_lock taken and we cannot take the
// ISR lock. Therefore, we use a flag to notify the GPU fault handler that
// the fault buffer needs to be flushed, before servicing the faults that
// belong to the va_space. The bits are set and cleared atomically so no
// va_space lock is required.
uvm_processor_mask_t needs_fault_buffer_flush;
// Mask of processors that are participating in system-wide atomics
uvm_processor_mask_t system_wide_atomics_enabled_processors;
@@ -353,6 +342,7 @@ struct uvm_va_space_struct
struct
{
bool page_prefetch_enabled;
bool skip_migrate_vma;
atomic_t migrate_vma_allocation_fail_nth;

View File

@@ -215,7 +215,13 @@ bool uvm_va_space_mm_enabled(uvm_va_space_t *va_space)
static struct mmu_notifier_ops uvm_mmu_notifier_ops_ats =
{
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE)
.invalidate_range = uvm_mmu_notifier_invalidate_range_ats,
#elif defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
.arch_invalidate_secondary_tlbs = uvm_mmu_notifier_invalidate_range_ats,
#else
#error One of invalidate_range/arch_invalidate_secondary_tlbs must be present
#endif
};
static int uvm_mmu_notifier_register(uvm_va_space_mm_t *va_space_mm)

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2021 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -98,4 +98,6 @@ void uvm_hal_volta_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->smc.supported = false;
parent_gpu->plc_supported = false;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2021 NVIDIA Corporation
Copyright (c) 2017-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -145,13 +145,20 @@ static NvU64 small_half_pde_volta(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_volta(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
static void make_pde_volta(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
uvm_page_directory_t *dir,
NvU32 child_index)
{
NvU32 entry_count = entries_per_index_volta(depth);
NvU32 entry_count;
NvU64 *entry_bits = (NvU64 *)entry;
UVM_ASSERT(dir);
entry_count = entries_per_index_volta(dir->depth);
if (entry_count == 1) {
*entry_bits = single_pde_volta(*phys_allocs, depth);
*entry_bits = single_pde_volta(*phys_allocs, dir->depth);
}
else if (entry_count == 2) {
entry_bits[MMU_BIG] = big_half_pde_volta(phys_allocs[MMU_BIG]);

View File

@@ -23,10 +23,16 @@
#include "internal_crypt_lib.h"
#ifdef USE_LKCA
#ifndef NV_CRYPTO_TFM_CTX_ALIGNED_PRESENT
#include <crypto/internal/hash.h>
#endif
#endif
void *lkca_hash_new(const char* alg_name)
{
#ifndef USE_LKCA
return false;
return NULL;
#else
//XXX: can we reuse crypto_shash part and just allocate desc
struct crypto_shash *alg;
@@ -87,9 +93,24 @@ bool lkca_hmac_duplicate(struct shash_desc *dst, struct shash_desc const *src)
struct crypto_shash *src_tfm = src->tfm;
struct crypto_shash *dst_tfm = dst->tfm;
int ss = crypto_shash_statesize(dst_tfm);
#ifdef NV_CRYPTO_TFM_CTX_ALIGNED_PRESENT
char *src_ipad = crypto_tfm_ctx_aligned(&src_tfm->base);
char *dst_ipad = crypto_tfm_ctx_aligned(&dst_tfm->base);
int ss = crypto_shash_statesize(dst_tfm);
#else
int ctx_size = crypto_shash_alg(dst_tfm)->base.cra_ctxsize;
char *src_ipad = crypto_shash_ctx(src_tfm);
char *dst_ipad = crypto_shash_ctx(dst_tfm);
/*
 * The actual struct definition is hidden, so we assume the data we need is
 * at the end. In 6.0 the struct has a pointer to crypto_shash followed by
 * 'u8 ipad[statesize];', then 'u8 opad[statesize];'.
 */
src_ipad += ctx_size - 2 * ss;
dst_ipad += ctx_size - 2 * ss;
#endif
memcpy(dst_ipad, src_ipad, crypto_shash_blocksize(src->tfm));
memcpy(dst_ipad + ss, src_ipad + ss, crypto_shash_blocksize(src->tfm));
crypto_shash_clear_flags(dst->tfm, CRYPTO_TFM_NEED_KEY);
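The fallback branch above leans entirely on the layout assumption spelled out in its comment. A small standalone check of that arithmetic against a hypothetical context layout (fake_hmac_ctx is invented for illustration; the real structure is private to the kernel crypto code):
#include <stddef.h>
#include <assert.h>

#define FAKE_STATESIZE 64

/* Hypothetical layout: the context ends with ipad[statesize], opad[statesize]. */
struct fake_hmac_ctx {
    void *hash;                         /* inner crypto_shash pointer */
    unsigned char ipad[FAKE_STATESIZE];
    unsigned char opad[FAKE_STATESIZE];
};

int main(void)
{
    size_t ctx_size = sizeof(struct fake_hmac_ctx);
    size_t ss = FAKE_STATESIZE;

    /* ctx_size - 2 * ss lands on ipad; adding ss again lands on opad. */
    assert(ctx_size - 2 * ss == offsetof(struct fake_hmac_ctx, ipad));
    assert(ctx_size - 2 * ss + ss == offsetof(struct fake_hmac_ctx, opad));
    return 0;
}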

View File

@@ -156,7 +156,7 @@ NvS32 NV_API_CALL nv_request_msix_irq(nv_linux_state_t *nvl)
{
for( j = 0; j < i; j++)
{
free_irq(nvl->msix_entries[i].vector, (void *)nvl);
free_irq(nvl->msix_entries[j].vector, (void *)nvl);
}
break;
}
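The index fix above is the usual partial-failure rollback: when registration fails at vector i, only the vectors that already succeeded (indices 0 through i-1) must be released, so the inner loop has to index with j. A self-contained sketch of that pattern with invented stand-ins for request_irq()/free_irq():
/* acquire_one()/release_one() and resource_t are invented stand-ins. */
typedef struct { int acquired; } resource_t;

static int acquire_one(resource_t *r) { r->acquired = 1; return 0; }
static void release_one(resource_t *r) { r->acquired = 0; }

static int acquire_all(resource_t *entries, int count)
{
    int i, j;

    for (i = 0; i < count; i++)
    {
        if (acquire_one(&entries[i]) != 0)
        {
            /* Roll back only the entries that succeeded: index with j, not i. */
            for (j = 0; j < i; j++)
                release_one(&entries[j]);
            return -1;
        }
    }
    return 0;
}

int main(void)
{
    resource_t res[4] = {{0}};
    return acquire_all(res, 4);
}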

View File

@@ -316,14 +316,14 @@ int nvidia_p2p_init_mapping(
return -ENOTSUPP;
}
EXPORT_SYMBOL(nvidia_p2p_init_mapping);
NV_EXPORT_SYMBOL(nvidia_p2p_init_mapping);
int nvidia_p2p_destroy_mapping(uint64_t p2p_token)
{
return -ENOTSUPP;
}
EXPORT_SYMBOL(nvidia_p2p_destroy_mapping);
NV_EXPORT_SYMBOL(nvidia_p2p_destroy_mapping);
static void nv_p2p_mem_info_free_callback(void *data)
{
@@ -506,8 +506,13 @@ static int nv_p2p_get_pages(
(*page_table)->page_size = page_size_index;
os_free_mem(physical_addresses);
physical_addresses = NULL;
os_free_mem(wreqmb_h);
wreqmb_h = NULL;
os_free_mem(rreqmb_h);
rreqmb_h = NULL;
if (free_callback != NULL)
{
@@ -582,7 +587,7 @@ int nvidia_p2p_get_pages(
p2p_token, va_space, virtual_address,
length, page_table, free_callback, data);
}
EXPORT_SYMBOL(nvidia_p2p_get_pages);
NV_EXPORT_SYMBOL(nvidia_p2p_get_pages);
int nvidia_p2p_get_pages_persistent(
uint64_t virtual_address,
@@ -600,7 +605,7 @@ int nvidia_p2p_get_pages_persistent(
virtual_address, length, page_table,
NULL, NULL);
}
EXPORT_SYMBOL(nvidia_p2p_get_pages_persistent);
NV_EXPORT_SYMBOL(nvidia_p2p_get_pages_persistent);
/*
* This function is a no-op, but is left in place (for now), in order to allow
@@ -613,7 +618,7 @@ int nvidia_p2p_free_page_table(struct nvidia_p2p_page_table *page_table)
return 0;
}
EXPORT_SYMBOL(nvidia_p2p_free_page_table);
NV_EXPORT_SYMBOL(nvidia_p2p_free_page_table);
int nvidia_p2p_put_pages(
uint64_t p2p_token,
@@ -645,7 +650,7 @@ int nvidia_p2p_put_pages(
return nvidia_p2p_map_status(status);
}
EXPORT_SYMBOL(nvidia_p2p_put_pages);
NV_EXPORT_SYMBOL(nvidia_p2p_put_pages);
int nvidia_p2p_put_pages_persistent(
uint64_t virtual_address,
@@ -685,7 +690,7 @@ int nvidia_p2p_put_pages_persistent(
return nvidia_p2p_map_status(status);
}
EXPORT_SYMBOL(nvidia_p2p_put_pages_persistent);
NV_EXPORT_SYMBOL(nvidia_p2p_put_pages_persistent);
int nvidia_p2p_dma_map_pages(
struct pci_dev *peer,
@@ -800,7 +805,7 @@ failed:
return nvidia_p2p_map_status(status);
}
EXPORT_SYMBOL(nvidia_p2p_dma_map_pages);
NV_EXPORT_SYMBOL(nvidia_p2p_dma_map_pages);
int nvidia_p2p_dma_unmap_pages(
struct pci_dev *peer,
@@ -840,7 +845,7 @@ int nvidia_p2p_dma_unmap_pages(
return 0;
}
EXPORT_SYMBOL(nvidia_p2p_dma_unmap_pages);
NV_EXPORT_SYMBOL(nvidia_p2p_dma_unmap_pages);
/*
* This function is a no-op, but is left in place (for now), in order to allow
@@ -855,7 +860,7 @@ int nvidia_p2p_free_dma_mapping(
return 0;
}
EXPORT_SYMBOL(nvidia_p2p_free_dma_mapping);
NV_EXPORT_SYMBOL(nvidia_p2p_free_dma_mapping);
int nvidia_p2p_register_rsync_driver(
nvidia_p2p_rsync_driver_t *driver,
@@ -884,7 +889,7 @@ int nvidia_p2p_register_rsync_driver(
driver->wait_for_rsync, data);
}
EXPORT_SYMBOL(nvidia_p2p_register_rsync_driver);
NV_EXPORT_SYMBOL(nvidia_p2p_register_rsync_driver);
void nvidia_p2p_unregister_rsync_driver(
nvidia_p2p_rsync_driver_t *driver,
@@ -916,7 +921,7 @@ void nvidia_p2p_unregister_rsync_driver(
driver->wait_for_rsync, data);
}
EXPORT_SYMBOL(nvidia_p2p_unregister_rsync_driver);
NV_EXPORT_SYMBOL(nvidia_p2p_unregister_rsync_driver);
int nvidia_p2p_get_rsync_registers(
nvidia_p2p_rsync_reg_info_t **reg_info
@@ -1009,7 +1014,7 @@ int nvidia_p2p_get_rsync_registers(
return 0;
}
EXPORT_SYMBOL(nvidia_p2p_get_rsync_registers);
NV_EXPORT_SYMBOL(nvidia_p2p_get_rsync_registers);
void nvidia_p2p_put_rsync_registers(
nvidia_p2p_rsync_reg_info_t *reg_info
@@ -1041,4 +1046,4 @@ void nvidia_p2p_put_rsync_registers(
os_free_mem(reg_info);
}
EXPORT_SYMBOL(nvidia_p2p_put_rsync_registers);
NV_EXPORT_SYMBOL(nvidia_p2p_put_rsync_registers);

View File

@@ -1224,12 +1224,11 @@ static int nv_start_device(nv_state_t *nv, nvidia_stack_t *sp)
rm_read_registry_dword(sp, nv, NV_REG_ENABLE_MSI, &msi_config);
if (msi_config == 1)
{
if (pci_find_capability(nvl->pci_dev, PCI_CAP_ID_MSIX))
if (nvl->pci_dev->msix_cap && rm_is_msix_allowed(sp, nv))
{
nv_init_msix(nv);
}
if (pci_find_capability(nvl->pci_dev, PCI_CAP_ID_MSI) &&
!(nv->flags & NV_FLAG_USES_MSIX))
if (nvl->pci_dev->msi_cap && !(nv->flags & NV_FLAG_USES_MSIX))
{
nv_init_msi(nv);
}

View File

@@ -195,6 +195,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += devm_clk_bulk_get_all
NV_CONFTEST_FUNCTION_COMPILE_TESTS += get_task_ioprio
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mdev_set_iommu_device
NV_CONFTEST_FUNCTION_COMPILE_TESTS += offline_and_remove_memory
NV_CONFTEST_FUNCTION_COMPILE_TESTS += crypto_tfm_ctx_aligned
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_gpl_of_node_to_nid
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_gpl_sme_active
@@ -215,6 +216,7 @@ NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_get_dram_num_channe
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tegra_dram_types
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_pxm_to_node
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_screen_info
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_gpl_screen_info
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_i2c_bus_status
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tegra_fuse_control_read
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tegra_get_platform

View File

@@ -46,6 +46,11 @@ NvlStatus nvlink_lib_unload(void);
*/
NvlStatus nvlink_lib_ioctl_ctrl(nvlink_ioctrl_params *ctrl_params);
/*
* Gets the number of devices of type deviceType
*/
NvlStatus nvlink_lib_return_device_count_by_type(NvU32 deviceType, NvU32 *numDevices);
#ifdef __cplusplus
}
#endif

View File

@@ -28,6 +28,11 @@
#include "nv-time.h"
#include <linux/mmzone.h>
#include <linux/numa.h>
#include <linux/pid.h>
extern char *NVreg_TemporaryFilePath;
#define MAX_ERROR_STRING 512
@@ -1242,9 +1247,12 @@ void NV_API_CALL os_get_screen_info(
* SYSFB_SIMPLEFB registers a dummy framebuffer which does not contain the
* information required by os_get_screen_info(), therefore you need to
* fall back onto the screen_info structure.
*
* After commit b8466fe82b79 ("efi: move screen_info into efi init code")
* in v6.7, 'screen_info' is exported as a GPL-licensed symbol on ARM64.
*/
#if NV_IS_EXPORT_SYMBOL_PRESENT_screen_info
#if NV_CHECK_EXPORT_SYMBOL(screen_info)
/*
* If there is not a framebuffer console, return 0 size.
*
@@ -2122,6 +2130,43 @@ void NV_API_CALL os_nv_cap_close_fd
nv_cap_close_fd(fd);
}
/*
* Reads the total memory and free memory of a NUMA node from the kernel.
*/
NV_STATUS NV_API_CALL os_get_numa_node_memory_usage
(
NvS32 node_id,
NvU64 *free_memory_bytes,
NvU64 *total_memory_bytes
)
{
struct pglist_data *pgdat;
struct zone *zone;
NvU32 zone_id;
if (node_id >= MAX_NUMNODES)
{
nv_printf(NV_DBG_ERRORS, "Invalid NUMA node ID\n");
return NV_ERR_INVALID_ARGUMENT;
}
pgdat = NODE_DATA(node_id);
*free_memory_bytes = 0;
*total_memory_bytes = 0;
for (zone_id = 0; zone_id < MAX_NR_ZONES; zone_id++)
{
zone = &(pgdat->node_zones[zone_id]);
if (!populated_zone(zone))
continue;
*free_memory_bytes += (zone_page_state_snapshot(zone, NR_FREE_PAGES) * PAGE_SIZE);
*total_memory_bytes += (zone->present_pages * PAGE_SIZE);
}
return NV_OK;
}
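A brief caller-side sketch of the new helper, shown only as an illustration (node 0 and the debug print are arbitrary choices, not part of this change):
NvU64 free_bytes = 0, total_bytes = 0;

if (os_get_numa_node_memory_usage(0, &free_bytes, &total_bytes) == NV_OK &&
    total_bytes != 0)
{
    NvU64 used_pct = ((total_bytes - free_bytes) * 100) / total_bytes;
    nv_printf(NV_DBG_INFO, "NUMA node 0: %llu%% used\n", used_pct);
}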
typedef struct os_numa_gpu_mem_hotplug_notifier_s
{
NvU64 start_pa;
@@ -2373,3 +2418,28 @@ NV_STATUS NV_API_CALL os_offline_page_at_address
#endif
}
void* NV_API_CALL os_get_pid_info(void)
{
return get_task_pid(current, PIDTYPE_PID);
}
void NV_API_CALL os_put_pid_info(void *pid_info)
{
if (pid_info != NULL)
put_pid(pid_info);
}
NV_STATUS NV_API_CALL os_find_ns_pid(void *pid_info, NvU32 *ns_pid)
{
if ((pid_info == NULL) || (ns_pid == NULL))
return NV_ERR_INVALID_ARGUMENT;
*ns_pid = pid_vnr((struct pid *)pid_info);
// The call returns 0 if the PID is not found in the current ns
if (*ns_pid == 0)
return NV_ERR_OBJECT_NOT_FOUND;
return NV_OK;
}
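Taken together, the three helpers follow a get/translate/put pattern. A short illustrative fragment, assuming process context (the local names are not from this change):
void  *pid_info = os_get_pid_info();
NvU32  ns_pid   = 0;

if (os_find_ns_pid(pid_info, &ns_pid) == NV_OK)
{
    /* ns_pid is the caller's PID as seen from the current PID namespace. */
}

os_put_pid_info(pid_info);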