545.23.06

Andy Ritger
2023-10-17 09:25:29 -07:00
parent f59818b751
commit b5bf85a8e3
917 changed files with 132480 additions and 110015 deletions

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2022 NVIDIA Corporation
Copyright (c) 2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2022 NVIDIA Corporation
Copyright (c) 2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

View File

@@ -247,6 +247,11 @@ int nv_kthread_q_init_on_node(nv_kthread_q_t *q, const char *q_name, int preferr
return 0;
}
int nv_kthread_q_init(nv_kthread_q_t *q, const char *qname)
{
return nv_kthread_q_init_on_node(q, qname, NV_KTHREAD_NO_NODE);
}
// Returns true (non-zero) if the item was actually scheduled, and false if the
// item was already pending in a queue.
static int _raw_q_schedule(nv_kthread_q_t *q, nv_kthread_q_item_t *q_item)
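A minimal caller-side sketch of the nv_kthread_q_init() wrapper above, for readers unfamiliar with the queue API. It assumes the usual nv-kthread-q item interface (nv_kthread_q_item_init, nv_kthread_q_schedule_q_item, nv_kthread_q_stop) and the "nv-kthread-q.h" header name; treat those names as assumptions, not part of this hunk.

#include "nv-kthread-q.h"   // assumed header providing nv_kthread_q_t and friends

// Hypothetical example, not from this commit.
static void example_work_cb(void *args)
{
    (void)args;             // real work would happen here
}

static int example_use_queue(void)
{
    nv_kthread_q_t q;
    nv_kthread_q_item_t item;
    int ret;

    // No NUMA preference: per the wrapper above, this forwards NV_KTHREAD_NO_NODE.
    ret = nv_kthread_q_init(&q, "example_q");
    if (ret != 0)
        return ret;

    nv_kthread_q_item_init(&item, example_work_cb, NULL);

    // Non-zero if the item was scheduled, zero if it was already pending,
    // mirroring the _raw_q_schedule() contract documented above.
    nv_kthread_q_schedule_q_item(&q, &item);

    nv_kthread_q_stop(&q);  // flush pending items and stop the worker thread
    return 0;
}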

View File

@@ -27,6 +27,7 @@ NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_rm_mem.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_channel.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_lock.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hal.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_processors.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_range_tree.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_rb_tree.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_range_allocator.c

View File

@@ -82,10 +82,12 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_pages_uc
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ioasid_get
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_drop
NV_CONFTEST_FUNCTION_COMPILE_TESTS += migrate_vma_setup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmget_not_zero
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmgrab
NV_CONFTEST_FUNCTION_COMPILE_TESTS += iommu_sva_bind_device_has_drvdata_arg
NV_CONFTEST_FUNCTION_COMPILE_TESTS += vm_fault_to_errno
NV_CONFTEST_FUNCTION_COMPILE_TESTS += find_next_bit_wrap
NV_CONFTEST_TYPE_COMPILE_TESTS += backing_dev_info
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_context_t
@@ -99,6 +101,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += kmem_cache_has_kobj_remove_work
NV_CONFTEST_TYPE_COMPILE_TESTS += sysfs_slab_unlink
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_fault_t
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_invalidate_range
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_arch_invalidate_secondary_tlbs
NV_CONFTEST_TYPE_COMPILE_TESTS += proc_ops
NV_CONFTEST_TYPE_COMPILE_TESTS += timespec64
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock
@@ -113,4 +116,3 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += mpol_preferred_many_present
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_interval_notifier
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_int_active_memcg
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_migrate_vma_setup

View File

@@ -24,11 +24,11 @@
#include "nvstatus.h"
#if !defined(NV_PRINTF_STRING_SECTION)
#if defined(NVRM) && NVCPU_IS_RISCV64
#if defined(NVRM) && NVOS_IS_LIBOS
#define NV_PRINTF_STRING_SECTION __attribute__ ((section (".logging")))
#else // defined(NVRM) && NVCPU_IS_RISCV64
#else // defined(NVRM) && NVOS_IS_LIBOS
#define NV_PRINTF_STRING_SECTION
#endif // defined(NVRM) && NVCPU_IS_RISCV64
#endif // defined(NVRM) && NVOS_IS_LIBOS
#endif // !defined(NV_PRINTF_STRING_SECTION)
/*

View File

@@ -571,7 +571,6 @@ static void uvm_vm_open_managed_entry(struct vm_area_struct *vma)
static void uvm_vm_close_managed(struct vm_area_struct *vma)
{
uvm_va_space_t *va_space = uvm_va_space_get(vma->vm_file);
uvm_processor_id_t gpu_id;
bool make_zombie = false;
if (current->mm != NULL)
@@ -606,12 +605,6 @@ static void uvm_vm_close_managed(struct vm_area_struct *vma)
uvm_destroy_vma_managed(vma, make_zombie);
// Notify GPU address spaces that the fault buffer needs to be flushed to
// avoid finding stale entries that can be attributed to new VA ranges
// reallocated at the same address.
for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
}
uvm_va_space_up_write(va_space);
if (current->mm != NULL)

View File

@@ -216,6 +216,10 @@ NV_STATUS UvmDeinitialize(void);
// Note that it is not required to release VA ranges that were reserved with
// UvmReserveVa().
//
// This is useful for per-process checkpoint and restore, where kernel-mode
// state needs to be reconfigured to match the expectations of a pre-existing
// user-mode process.
//
// UvmReopen() closes the open file returned by UvmGetFileDescriptor() and
// replaces it with a new open file with the same name.
//
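A minimal restore-time sketch of the flow described above. The exact UvmReopen() prototype used here (a single flags argument) and the zero flags value are assumptions for illustration, not quoted from this header.

// Hypothetical checkpoint/restore helper, assuming NV_STATUS UvmReopen(NvU64 flags).
static NV_STATUS example_restore_uvm(void)
{
    // Swap the UVM file returned by UvmGetFileDescriptor() for a fresh open
    // file of the same name, as described above, then rebuild the remaining
    // user-mode UVM state (GPU registrations, mappings, ...) for the restored
    // process. Reserved VA ranges from UvmReserveVa() do not need to be
    // re-released first.
    NV_STATUS status = UvmReopen(0);

    if (status != NV_OK)
        return status;

    // ... re-create the remaining UVM resources here ...
    return NV_OK;
}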

View File

@@ -114,6 +114,8 @@ static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
{
uvm_ats_fault_invalidate_t *ats_invalidate;
uvm_ats_smmu_invalidate_tlbs(gpu_va_space, addr, size);
if (client_type == UVM_FAULT_CLIENT_TYPE_GPC)
ats_invalidate = &gpu_va_space->gpu->parent->fault_buffer_info.replayable.ats_invalidate;
else
@@ -588,4 +590,3 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
return status;
}

View File

@@ -29,8 +29,12 @@
#include "uvm_va_space.h"
#include "uvm_va_space_mm.h"
#include <asm/io.h>
#include <linux/iommu.h>
#include <linux/mm_types.h>
#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/mmu_context.h>
// linux/sched/mm.h is needed for mmget_not_zero and mmput to get the mm
// reference required for the iommu_sva_bind_device() call. This header is not
@@ -46,17 +50,271 @@
#define UVM_IOMMU_SVA_BIND_DEVICE(dev, mm) iommu_sva_bind_device(dev, mm)
#endif
// Base address of SMMU CMDQ-V for GSMMU0.
#define SMMU_CMDQV_BASE_ADDR(smmu_base) (smmu_base + 0x200000)
#define SMMU_CMDQV_BASE_LEN 0x00830000
// CMDQV configuration is done by firmware but we check status here.
#define SMMU_CMDQV_CONFIG 0x0
#define SMMU_CMDQV_CONFIG_CMDQV_EN BIT(0)
// Used to map a particular VCMDQ to a VINTF.
#define SMMU_CMDQV_CMDQ_ALLOC_MAP(vcmdq_id) (0x200 + 0x4 * (vcmdq_id))
#define SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC BIT(0)
// Shift for the field containing the index of the virtual interface
// owning the VCMDQ.
#define SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT 15
// Base address for the VINTF registers.
#define SMMU_VINTF_BASE_ADDR(cmdqv_base_addr, vintf_id) (cmdqv_base_addr + 0x1000 + 0x100 * (vintf_id))
// Virtual interface (VINTF) configuration registers. The WAR only
// works on baremetal so we need to configure ourselves as the
// hypervisor owner.
#define SMMU_VINTF_CONFIG 0x0
#define SMMU_VINTF_CONFIG_ENABLE BIT(0)
#define SMMU_VINTF_CONFIG_HYP_OWN BIT(17)
#define SMMU_VINTF_STATUS 0x0
#define SMMU_VINTF_STATUS_ENABLED BIT(0)
// Calculates the base address for a particular VCMDQ instance.
#define SMMU_VCMDQ_BASE_ADDR(cmdqv_base_addr, vcmdq_id) (cmdqv_base_addr + 0x10000 + 0x80 * (vcmdq_id))
// SMMU command queue consumer index register. Updated by SMMU
// when commands are consumed.
#define SMMU_VCMDQ_CONS 0x0
// SMMU command queue producer index register. Updated by UVM when
// commands are added to the queue.
#define SMMU_VCMDQ_PROD 0x4
// Configuration register used to enable a VCMDQ.
#define SMMU_VCMDQ_CONFIG 0x8
#define SMMU_VCMDQ_CONFIG_ENABLE BIT(0)
// Status register used to check the VCMDQ is enabled.
#define SMMU_VCMDQ_STATUS 0xc
#define SMMU_VCMDQ_STATUS_ENABLED BIT(0)
// Base address offset for the VCMDQ registers.
#define SMMU_VCMDQ_CMDQ_BASE 0x10000
// Size of the command queue. Each command is 8 bytes and we can't
// have a command queue greater than one page.
#define SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE 9
#define SMMU_VCMDQ_CMDQ_ENTRIES (1UL << SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE)
// We always use VINTF63 for the WAR
#define VINTF 63
static void smmu_vintf_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
{
iowrite32(val, SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg);
}
static NvU32 smmu_vintf_read32(void __iomem *smmu_cmdqv_base, int reg)
{
return ioread32(SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg);
}
// We always use VCMDQ127 for the WAR
#define VCMDQ 127
void smmu_vcmdq_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
{
iowrite32(val, SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
NvU32 smmu_vcmdq_read32(void __iomem *smmu_cmdqv_base, int reg)
{
return ioread32(SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
static void smmu_vcmdq_write64(void __iomem *smmu_cmdqv_base, int reg, NvU64 val)
{
iowrite64(val, SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
// Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
// TLB invalidates on read-only to read-write upgrades
static NV_STATUS uvm_ats_smmu_war_init(uvm_parent_gpu_t *parent_gpu)
{
uvm_spin_loop_t spin;
NV_STATUS status;
unsigned long cmdqv_config;
void __iomem *smmu_cmdqv_base;
struct acpi_iort_node *node;
struct acpi_iort_smmu_v3 *iort_smmu;
node = *(struct acpi_iort_node **) dev_get_platdata(parent_gpu->pci_dev->dev.iommu->iommu_dev->dev->parent);
iort_smmu = (struct acpi_iort_smmu_v3 *) node->node_data;
smmu_cmdqv_base = ioremap(SMMU_CMDQV_BASE_ADDR(iort_smmu->base_address), SMMU_CMDQV_BASE_LEN);
if (!smmu_cmdqv_base)
return NV_ERR_NO_MEMORY;
parent_gpu->smmu_war.smmu_cmdqv_base = smmu_cmdqv_base;
cmdqv_config = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CONFIG);
if (!(cmdqv_config & SMMU_CMDQV_CONFIG_CMDQV_EN)) {
status = NV_ERR_OBJECT_NOT_FOUND;
goto out;
}
// Allocate SMMU CMDQ pages for WAR
parent_gpu->smmu_war.smmu_cmdq = alloc_page(NV_UVM_GFP_FLAGS | __GFP_ZERO);
if (!parent_gpu->smmu_war.smmu_cmdq) {
status = NV_ERR_NO_MEMORY;
goto out;
}
// Initialise VINTF for the WAR
smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, SMMU_VINTF_CONFIG_ENABLE | SMMU_VINTF_CONFIG_HYP_OWN);
UVM_SPIN_WHILE(!(smmu_vintf_read32(smmu_cmdqv_base, SMMU_VINTF_STATUS) & SMMU_VINTF_STATUS_ENABLED), &spin);
// Allocate VCMDQ to VINTF
iowrite32((VINTF << SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT) | SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC,
smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
BUILD_BUG_ON((SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE + 3) > PAGE_SHIFT);
smmu_vcmdq_write64(smmu_cmdqv_base, SMMU_VCMDQ_CMDQ_BASE,
page_to_phys(parent_gpu->smmu_war.smmu_cmdq) | SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONS, 0);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_PROD, 0);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, SMMU_VCMDQ_CONFIG_ENABLE);
UVM_SPIN_WHILE(!(smmu_vcmdq_read32(smmu_cmdqv_base, SMMU_VCMDQ_STATUS) & SMMU_VCMDQ_STATUS_ENABLED), &spin);
uvm_mutex_init(&parent_gpu->smmu_war.smmu_lock, UVM_LOCK_ORDER_LEAF);
parent_gpu->smmu_war.smmu_prod = 0;
parent_gpu->smmu_war.smmu_cons = 0;
return NV_OK;
out:
iounmap(parent_gpu->smmu_war.smmu_cmdqv_base);
parent_gpu->smmu_war.smmu_cmdqv_base = NULL;
return status;
}
static void uvm_ats_smmu_war_deinit(uvm_parent_gpu_t *parent_gpu)
{
void __iomem *smmu_cmdqv_base = parent_gpu->smmu_war.smmu_cmdqv_base;
NvU32 cmdq_alloc_map;
if (parent_gpu->smmu_war.smmu_cmdqv_base) {
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, 0);
cmdq_alloc_map = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
iowrite32(cmdq_alloc_map & SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC, smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, 0);
}
if (parent_gpu->smmu_war.smmu_cmdq)
__free_page(parent_gpu->smmu_war.smmu_cmdq);
if (parent_gpu->smmu_war.smmu_cmdqv_base)
iounmap(parent_gpu->smmu_war.smmu_cmdqv_base);
}
// The SMMU on ARM64 can run under different translation regimes depending on
// what features the OS and CPU variant support. The CPU for GH180 supports
// virtualisation extensions and starts the kernel at EL2, meaning the SMMU operates
// under the NS-EL2-E2H translation regime. Therefore we need to use the
// TLBI_EL2_* commands which invalidate TLB entries created under this
// translation regime.
#define CMDQ_OP_TLBI_EL2_ASID 0x21;
#define CMDQ_OP_TLBI_EL2_VA 0x22;
#define CMDQ_OP_CMD_SYNC 0x46
// Use the same maximum as used for MAX_TLBI_OPS in the upstream
// kernel.
#define UVM_MAX_TLBI_OPS (1UL << (PAGE_SHIFT - 3))
#if UVM_ATS_SMMU_WAR_REQUIRED()
void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
struct mm_struct *mm = gpu_va_space->va_space->va_space_mm.mm;
uvm_parent_gpu_t *parent_gpu = gpu_va_space->gpu->parent;
struct {
NvU64 low;
NvU64 high;
} *vcmdq;
unsigned long vcmdq_prod;
NvU64 end;
uvm_spin_loop_t spin;
NvU16 asid;
if (!parent_gpu->smmu_war.smmu_cmdqv_base)
return;
asid = arm64_mm_context_get(mm);
vcmdq = kmap(parent_gpu->smmu_war.smmu_cmdq);
uvm_mutex_lock(&parent_gpu->smmu_war.smmu_lock);
vcmdq_prod = parent_gpu->smmu_war.smmu_prod;
// Our queue management is very simple. The mutex prevents multiple
// producers from writing to the queue, and all our commands require waiting
// for the queue to drain so we know it's empty. If we can't fit enough
// commands in the queue we just invalidate the whole ASID.
//
// The command queue is a circular buffer with the MSB representing a wrap
// bit that must toggle on each wrap. See the SMMU architecture
// specification for more details.
//
// SMMU_VCMDQ_CMDQ_ENTRIES - 1 because we need to leave space for the
// CMD_SYNC.
if ((size >> PAGE_SHIFT) > min(UVM_MAX_TLBI_OPS, SMMU_VCMDQ_CMDQ_ENTRIES - 1)) {
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_TLBI_EL2_ASID;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low |= (NvU64) asid << 48;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = 0;
vcmdq_prod++;
}
else {
for (end = addr + size; addr < end; addr += PAGE_SIZE) {
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_TLBI_EL2_VA;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low |= (NvU64) asid << 48;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = addr & ~((1UL << 12) - 1);
vcmdq_prod++;
}
}
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_CMD_SYNC;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = 0x0;
vcmdq_prod++;
// MSB is the wrap bit
vcmdq_prod &= (1UL << (SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE + 1)) - 1;
parent_gpu->smmu_war.smmu_prod = vcmdq_prod;
smmu_vcmdq_write32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_PROD, parent_gpu->smmu_war.smmu_prod);
UVM_SPIN_WHILE(
(smmu_vcmdq_read32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_CONS) & GENMASK(19, 0)) != vcmdq_prod,
&spin);
uvm_mutex_unlock(&parent_gpu->smmu_war.smmu_lock);
kunmap(parent_gpu->smmu_war.smmu_cmdq);
arm64_mm_context_put(mm);
}
#endif
NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
{
int ret;
ret = iommu_dev_enable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
if (ret)
return errno_to_nv_status(ret);
return errno_to_nv_status(ret);
if (UVM_ATS_SMMU_WAR_REQUIRED())
return uvm_ats_smmu_war_init(parent_gpu);
else
return NV_OK;
}
void uvm_ats_sva_remove_gpu(uvm_parent_gpu_t *parent_gpu)
{
if (UVM_ATS_SMMU_WAR_REQUIRED())
uvm_ats_smmu_war_deinit(parent_gpu);
iommu_dev_disable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
}
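A standalone user-space sketch (not driver code) of the producer-index arithmetic used by uvm_ats_smmu_invalidate_tlbs() above: the index carries SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE slot bits plus one extra wrap bit in the MSB, and the driver masks the index the same way after appending a batch of commands.

#include <stdio.h>

#define LOG2SIZE 9ul                  // mirrors SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE
#define ENTRIES  (1ul << LOG2SIZE)    // 512 queue slots

// Advance the producer index by one command, keeping LOG2SIZE index bits plus
// the wrap bit. The wrap bit toggles every time the index rolls over, which is
// how producer and consumer distinguish a full queue from an empty one.
static unsigned long push_one(unsigned long prod)
{
    unsigned long slot = prod % ENTRIES;    // slot the command would be written to
    (void)slot;
    prod++;
    return prod & ((1ul << (LOG2SIZE + 1)) - 1);
}

int main(void)
{
    unsigned long prod;

    prod = push_one(ENTRIES - 1);           // write slot 511
    printf("prod=%lu slot=%lu wrap=%lu\n", prod, prod % ENTRIES, prod >> LOG2SIZE);
    // prints prod=512 slot=0 wrap=1: back to slot 0 with the wrap bit set

    prod = push_one(2 * ENTRIES - 1);       // write slot 511 with the wrap bit set
    printf("prod=%lu slot=%lu wrap=%lu\n", prod, prod % ENTRIES, prod >> LOG2SIZE);
    // prints prod=0 slot=0 wrap=0: 1024 masks to 0, toggling the wrap bit again
    return 0;
}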

View File

@@ -53,6 +53,17 @@
#define UVM_ATS_SVA_SUPPORTED() 0
#endif
// If NV_ARCH_INVALIDATE_SECONDARY_TLBS is defined it means the upstream fix is
// in place so no need for the WAR from Bug 4130089: [GH180][r535] WAR for
// kernel not issuing SMMU TLB invalidates on read-only to read-write upgrades
#if defined(NV_ARCH_INVALIDATE_SECONDARY_TLBS)
#define UVM_ATS_SMMU_WAR_REQUIRED() 0
#elif NVCPU_IS_AARCH64
#define UVM_ATS_SMMU_WAR_REQUIRED() 1
#else
#define UVM_ATS_SMMU_WAR_REQUIRED() 0
#endif
typedef struct
{
int placeholder;
@@ -81,6 +92,17 @@ typedef struct
// LOCKING: None
void uvm_ats_sva_unregister_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space);
// Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
// TLB invalidates on read-only to read-write upgrades
#if UVM_ATS_SMMU_WAR_REQUIRED()
void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size);
#else
static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
}
#endif
#else
static NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
{
@@ -111,6 +133,11 @@ typedef struct
{
}
static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
}
#endif // UVM_ATS_SVA_SUPPORTED
#endif // __UVM_ATS_SVA_H__

View File

@@ -2683,7 +2683,7 @@ static void init_channel_manager_conf(uvm_channel_manager_t *manager)
// caches vidmem (and sysmem), we place GPFIFO and GPPUT on sysmem to avoid
// cache thrash. The memory access latency is reduced, despite the required
// access through the bus, because no cache coherence message is exchanged.
if (uvm_gpu_is_coherent(gpu->parent)) {
if (uvm_parent_gpu_is_coherent(gpu->parent)) {
manager->conf.gpfifo_loc = UVM_BUFFER_LOCATION_SYS;
// On GPUs with limited ESCHED addressing range, e.g., Volta on P9, RM

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2021 NVIDIA Corporation
Copyright (c) 2013-2023 NVIDIA Corporation
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@@ -233,18 +233,6 @@ unsigned uvm_get_stale_thread_id(void)
return (unsigned)task_pid_vnr(current);
}
//
// A simple security rule for allowing access to UVM user space memory: if you
// are the same user as the owner of the memory, or if you are root, then you
// are granted access. The idea is to allow debuggers and profilers to work, but
// without opening up any security holes.
//
NvBool uvm_user_id_security_check(uid_t euidTarget)
{
return (NV_CURRENT_EUID() == euidTarget) ||
(UVM_ROOT_UID == euidTarget);
}
void on_uvm_test_fail(void)
{
(void)NULL;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2021 NVIDIA Corporation
Copyright (c) 2013-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -282,9 +282,6 @@ static inline void kmem_cache_destroy_safe(struct kmem_cache **ppCache)
}
}
static const uid_t UVM_ROOT_UID = 0;
typedef struct
{
NvU64 start_time_ns;
@@ -335,7 +332,6 @@ NV_STATUS errno_to_nv_status(int errnoCode);
int nv_status_to_errno(NV_STATUS status);
unsigned uvm_get_stale_process_id(void);
unsigned uvm_get_stale_thread_id(void);
NvBool uvm_user_id_security_check(uid_t euidTarget);
extern int uvm_enable_builtin_tests;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2021-2023 NVIDIA Corporation
Copyright (c) 2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -54,26 +54,23 @@ bool uvm_conf_computing_mode_is_hcc(const uvm_gpu_t *gpu)
return uvm_conf_computing_get_mode(gpu->parent) == UVM_GPU_CONF_COMPUTE_MODE_HCC;
}
void uvm_conf_computing_check_parent_gpu(const uvm_parent_gpu_t *parent)
NV_STATUS uvm_conf_computing_init_parent_gpu(const uvm_parent_gpu_t *parent)
{
uvm_gpu_t *first_gpu;
UvmGpuConfComputeMode cc, sys_cc;
uvm_gpu_t *first;
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
// The Confidential Computing state of the GPU should match that of the
// system.
UVM_ASSERT(uvm_conf_computing_mode_enabled_parent(parent) == g_uvm_global.conf_computing_enabled);
// TODO: Bug 2844714: since we have no routine to traverse parent GPUs,
// find first child GPU and get its parent.
first_gpu = uvm_global_processor_mask_find_first_gpu(&g_uvm_global.retained_gpus);
if (first_gpu == NULL)
return;
first = uvm_global_processor_mask_find_first_gpu(&g_uvm_global.retained_gpus);
if (!first)
return NV_OK;
// All GPUs derive Confidential Computing status from their parent. By
// current policy all parent GPUs have identical Confidential Computing
// status.
UVM_ASSERT(uvm_conf_computing_get_mode(parent) == uvm_conf_computing_get_mode(first_gpu->parent));
sys_cc = uvm_conf_computing_get_mode(first->parent);
cc = uvm_conf_computing_get_mode(parent);
return cc == sys_cc ? NV_OK : NV_ERR_NOT_SUPPORTED;
}
static void dma_buffer_destroy_locked(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool,

View File

@@ -60,8 +60,10 @@
// UVM_METHOD_SIZE * 2 * 10 = 80.
#define UVM_CONF_COMPUTING_SIGN_BUF_MAX_SIZE 80
void uvm_conf_computing_check_parent_gpu(const uvm_parent_gpu_t *parent);
// All GPUs derive confidential computing status from their parent.
// By current policy all parent GPUs have identical confidential
// computing status.
NV_STATUS uvm_conf_computing_init_parent_gpu(const uvm_parent_gpu_t *parent);
bool uvm_conf_computing_mode_enabled_parent(const uvm_parent_gpu_t *parent);
bool uvm_conf_computing_mode_enabled(const uvm_gpu_t *gpu);
bool uvm_conf_computing_mode_is_hcc(const uvm_gpu_t *gpu);

View File

@@ -71,6 +71,11 @@ static void uvm_unregister_callbacks(void)
}
}
static void sev_init(const UvmPlatformInfo *platform_info)
{
g_uvm_global.sev_enabled = platform_info->sevEnabled;
}
NV_STATUS uvm_global_init(void)
{
NV_STATUS status;
@@ -119,7 +124,8 @@ NV_STATUS uvm_global_init(void)
uvm_ats_init(&platform_info);
g_uvm_global.num_simulated_devices = 0;
g_uvm_global.conf_computing_enabled = platform_info.confComputingEnabled;
sev_init(&platform_info);
status = uvm_gpu_init();
if (status != NV_OK) {

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -143,16 +143,11 @@ struct uvm_global_struct
struct page *page;
} unload_state;
// True if the VM has AMD's SEV, or equivalent HW security extensions such
// as Intel's TDX, enabled. The flag is always false on the host.
//
// This value moves in tandem with that of Confidential Computing in the
// GPU(s) in all supported configurations, so it is used as a proxy for the
// Confidential Computing state.
//
// This field is set once during global initialization (uvm_global_init),
// and can be read afterwards without acquiring any locks.
bool conf_computing_enabled;
// AMD Secure Encrypted Virtualization (SEV) status. True if VM has SEV
// enabled. This field is set once during global initialization
// (uvm_global_init), and can be read afterwards without acquiring any
// locks.
bool sev_enabled;
};
// Initialize global uvm state
@@ -238,10 +233,8 @@ static uvm_gpu_t *uvm_gpu_get_by_processor_id(uvm_processor_id_t id)
return gpu;
}
static uvmGpuSessionHandle uvm_gpu_session_handle(uvm_gpu_t *gpu)
static uvmGpuSessionHandle uvm_global_session_handle(void)
{
if (gpu->parent->smc.enabled)
return gpu->smc.rm_session_handle;
return g_uvm_global.rm_session_handle;
}

View File

@@ -99,8 +99,8 @@ static void fill_gpu_info(uvm_parent_gpu_t *parent_gpu, const UvmGpuInfo *gpu_in
parent_gpu->system_bus.link_rate_mbyte_per_s = gpu_info->sysmemLinkRateMBps;
if (gpu_info->systemMemoryWindowSize > 0) {
// memory_window_end is inclusive but uvm_gpu_is_coherent() checks
// memory_window_end > memory_window_start as its condition.
// memory_window_end is inclusive but uvm_parent_gpu_is_coherent()
// checks memory_window_end > memory_window_start as its condition.
UVM_ASSERT(gpu_info->systemMemoryWindowSize > 1);
parent_gpu->system_bus.memory_window_start = gpu_info->systemMemoryWindowStart;
parent_gpu->system_bus.memory_window_end = gpu_info->systemMemoryWindowStart +
@@ -136,12 +136,12 @@ static NV_STATUS get_gpu_caps(uvm_gpu_t *gpu)
return status;
if (gpu_caps.numaEnabled) {
UVM_ASSERT(uvm_gpu_is_coherent(gpu->parent));
UVM_ASSERT(uvm_parent_gpu_is_coherent(gpu->parent));
gpu->mem_info.numa.enabled = true;
gpu->mem_info.numa.node_id = gpu_caps.numaNodeId;
}
else {
UVM_ASSERT(!uvm_gpu_is_coherent(gpu->parent));
UVM_ASSERT(!uvm_parent_gpu_is_coherent(gpu->parent));
}
return NV_OK;
@@ -1089,7 +1089,7 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
{
NV_STATUS status;
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(g_uvm_global.rm_session_handle,
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_global_session_handle(),
gpu_info,
gpu_uuid,
&parent_gpu->rm_device,
@@ -1099,7 +1099,12 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
return status;
}
uvm_conf_computing_check_parent_gpu(parent_gpu);
status = uvm_conf_computing_init_parent_gpu(parent_gpu);
if (status != NV_OK) {
UVM_ERR_PRINT("Confidential computing: %s, GPU %s\n",
nvstatusToString(status), parent_gpu->name);
return status;
}
parent_gpu->pci_dev = gpu_platform_info->pci_dev;
parent_gpu->closest_cpu_numa_node = dev_to_node(&parent_gpu->pci_dev->dev);
@@ -1161,19 +1166,8 @@ static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info)
{
NV_STATUS status;
// Presently, an RM client can only subscribe to a single partition per
// GPU. Therefore, UVM needs to create several RM clients. For simplicity,
// and since P2P is not supported when SMC partitions are created, we
// create a client (session) per GPU partition.
if (gpu->parent->smc.enabled) {
UvmPlatformInfo platform_info;
status = uvm_rm_locked_call(nvUvmInterfaceSessionCreate(&gpu->smc.rm_session_handle, &platform_info));
if (status != NV_OK) {
UVM_ERR_PRINT("Creating RM session failed: %s\n", nvstatusToString(status));
return status;
}
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_gpu_session_handle(gpu),
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_global_session_handle(),
gpu_info,
uvm_gpu_uuid(gpu),
&gpu->smc.rm_device,
@@ -1543,9 +1537,6 @@ static void deinit_gpu(uvm_gpu_t *gpu)
if (gpu->parent->smc.enabled) {
if (gpu->smc.rm_device != 0)
uvm_rm_locked_call_void(nvUvmInterfaceDeviceDestroy(gpu->smc.rm_device));
if (gpu->smc.rm_session_handle != 0)
uvm_rm_locked_call_void(nvUvmInterfaceSessionDestroy(gpu->smc.rm_session_handle));
}
gpu->magic = 0;
@@ -2575,7 +2566,7 @@ static void disable_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
uvm_mmu_destroy_peer_identity_mappings(gpu0, gpu1);
uvm_mmu_destroy_peer_identity_mappings(gpu1, gpu0);
uvm_rm_locked_call_void(nvUvmInterfaceP2pObjectDestroy(uvm_gpu_session_handle(gpu0), p2p_handle));
uvm_rm_locked_call_void(nvUvmInterfaceP2pObjectDestroy(uvm_global_session_handle(), p2p_handle));
UVM_ASSERT(uvm_gpu_get(gpu0->global_id) == gpu0);
UVM_ASSERT(uvm_gpu_get(gpu1->global_id) == gpu1);
@@ -2701,9 +2692,9 @@ uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_p
return id;
}
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id1, const uvm_gpu_id_t gpu_id2)
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1)
{
NvU32 table_index = uvm_gpu_peer_table_index(gpu_id1, gpu_id2);
NvU32 table_index = uvm_gpu_peer_table_index(gpu_id0, gpu_id1);
return &g_uvm_global.peers[table_index];
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -167,7 +167,7 @@ struct uvm_service_block_context_struct
} per_processor_masks[UVM_ID_MAX_PROCESSORS];
// State used by the VA block routines called by the servicing routine
uvm_va_block_context_t block_context;
uvm_va_block_context_t *block_context;
// Prefetch state hint
uvm_perf_prefetch_hint_t prefetch_hint;
@@ -263,7 +263,10 @@ struct uvm_fault_service_batch_context_struct
NvU32 num_coalesced_faults;
bool has_fatal_faults;
// One of the VA spaces in this batch which had fatal faults. If NULL, no
// faults were fatal. More than one VA space could have fatal faults, but we
// pick one to be the target of the cancel sequence.
uvm_va_space_t *fatal_va_space;
bool has_throttled_faults;
@@ -825,8 +828,6 @@ struct uvm_gpu_struct
{
NvU32 swizz_id;
uvmGpuSessionHandle rm_session_handle;
// RM device handle used in many of the UVM/RM APIs.
//
// Do not read this field directly, use uvm_gpu_device_handle instead.
@@ -1162,6 +1163,16 @@ struct uvm_parent_gpu_struct
NvU64 memory_window_start;
NvU64 memory_window_end;
} system_bus;
// WAR to issue ATS TLB invalidation commands ourselves.
struct
{
uvm_mutex_t smmu_lock;
struct page *smmu_cmdq;
void __iomem *smmu_cmdqv_base;
unsigned long smmu_prod;
unsigned long smmu_cons;
} smmu_war;
};
static const char *uvm_gpu_name(uvm_gpu_t *gpu)
@@ -1336,7 +1347,7 @@ static NvU64 uvm_gpu_retained_count(uvm_gpu_t *gpu)
void uvm_parent_gpu_kref_put(uvm_parent_gpu_t *gpu);
// Calculates peer table index using GPU ids.
NvU32 uvm_gpu_peer_table_index(uvm_gpu_id_t gpu_id1, uvm_gpu_id_t gpu_id2);
NvU32 uvm_gpu_peer_table_index(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);
// Either retains an existing PCIe peer entry or creates a new one. In both
// cases the two GPUs are also each retained.
@@ -1355,7 +1366,7 @@ uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu
uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr);
// Get the P2P capabilities between the gpus with the given indexes
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(uvm_gpu_id_t gpu_id1, uvm_gpu_id_t gpu_id2);
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);
// Get the P2P capabilities between the given gpus
static uvm_gpu_peer_t *uvm_gpu_peer_caps(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
@@ -1363,10 +1374,10 @@ static uvm_gpu_peer_t *uvm_gpu_peer_caps(const uvm_gpu_t *gpu0, const uvm_gpu_t
return uvm_gpu_index_peer_caps(gpu0->id, gpu1->id);
}
static bool uvm_gpus_are_nvswitch_connected(uvm_gpu_t *gpu1, uvm_gpu_t *gpu2)
static bool uvm_gpus_are_nvswitch_connected(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
if (gpu1->parent->nvswitch_info.is_nvswitch_connected && gpu2->parent->nvswitch_info.is_nvswitch_connected) {
UVM_ASSERT(uvm_gpu_peer_caps(gpu1, gpu2)->link_type >= UVM_GPU_LINK_NVLINK_2);
if (gpu0->parent->nvswitch_info.is_nvswitch_connected && gpu1->parent->nvswitch_info.is_nvswitch_connected) {
UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type >= UVM_GPU_LINK_NVLINK_2);
return true;
}
@@ -1511,7 +1522,7 @@ bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
// addresses.
NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr);
static bool uvm_gpu_is_coherent(const uvm_parent_gpu_t *parent_gpu)
static bool uvm_parent_gpu_is_coherent(const uvm_parent_gpu_t *parent_gpu)
{
return parent_gpu->system_bus.memory_window_end > parent_gpu->system_bus.memory_window_start;
}
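A small worked example of the check just above, using the NvU64 and bool types from the surrounding header. In fill_gpu_info() earlier in this commit, memory_window_end is stored as the inclusive end (start + size - 1) and the size is asserted to be greater than 1, so "end > start" is equivalent to "a system memory window larger than one byte exists". The values below are illustrative only.

// Illustrative sketch, not driver code.
static bool example_is_coherent(NvU64 window_start, NvU64 window_size)
{
    // A GPU with no system memory window leaves start == end (both zero),
    // which the check below correctly reports as not coherent.
    NvU64 window_end = window_size ? (window_start + window_size - 1) : window_start;

    return window_end > window_start;   // true iff window_size > 1
}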

View File

@@ -985,7 +985,7 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
return NV_OK;
if (uvm_processor_mask_test(&va_block->resident, processor))
residency_mask = uvm_va_block_resident_mask_get(va_block, processor);
residency_mask = uvm_va_block_resident_mask_get(va_block, processor, NUMA_NO_NODE);
else
residency_mask = NULL;
@@ -1036,8 +1036,8 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
// If the underlying VMA is gone, skip HMM migrations.
if (uvm_va_block_is_hmm(va_block)) {
status = uvm_hmm_find_vma(service_context->block_context.mm,
&service_context->block_context.hmm.vma,
status = uvm_hmm_find_vma(service_context->block_context->mm,
&service_context->block_context->hmm.vma,
address);
if (status == NV_ERR_INVALID_ADDRESS)
continue;
@@ -1048,7 +1048,7 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
policy = uvm_va_policy_get(va_block, address);
new_residency = uvm_va_block_select_residency(va_block,
&service_context->block_context,
service_context->block_context,
page_index,
processor,
uvm_fault_access_type_mask_bit(UVM_FAULT_ACCESS_TYPE_PREFETCH),
@@ -1083,7 +1083,7 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
// Remove pages that are already resident in the destination processors
for_each_id_in_mask(id, &update_processors) {
bool migrate_pages;
uvm_page_mask_t *residency_mask = uvm_va_block_resident_mask_get(va_block, id);
uvm_page_mask_t *residency_mask = uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE);
UVM_ASSERT(residency_mask);
migrate_pages = uvm_page_mask_andnot(&service_context->per_processor_masks[uvm_id_value(id)].new_residency,
@@ -1101,9 +1101,9 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
if (uvm_va_block_is_hmm(va_block)) {
status = NV_ERR_INVALID_ADDRESS;
if (service_context->block_context.mm) {
if (service_context->block_context->mm) {
status = uvm_hmm_find_policy_vma_and_outer(va_block,
&service_context->block_context.hmm.vma,
&service_context->block_context->hmm.vma,
first_page_index,
&policy,
&outer);
@@ -1206,7 +1206,7 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
service_context->operation = UVM_SERVICE_OPERATION_ACCESS_COUNTERS;
service_context->num_retries = 0;
service_context->block_context.mm = mm;
service_context->block_context->mm = mm;
if (uvm_va_block_is_hmm(va_block)) {
uvm_hmm_service_context_init(service_context);

View File

@@ -292,6 +292,7 @@ NV_STATUS uvm_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
{
NV_STATUS status = NV_OK;
char kthread_name[TASK_COMM_LEN + 1];
uvm_va_block_context_t *block_context;
if (parent_gpu->replayable_faults_supported) {
status = uvm_gpu_fault_buffer_init(parent_gpu);
@@ -311,6 +312,12 @@ NV_STATUS uvm_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
if (!parent_gpu->isr.replayable_faults.stats.cpu_exec_count)
return NV_ERR_NO_MEMORY;
block_context = uvm_va_block_context_alloc(NULL);
if (!block_context)
return NV_ERR_NO_MEMORY;
parent_gpu->fault_buffer_info.replayable.block_service_context.block_context = block_context;
parent_gpu->isr.replayable_faults.handling = true;
snprintf(kthread_name, sizeof(kthread_name), "UVM GPU%u BH", uvm_id_value(parent_gpu->id));
@@ -333,6 +340,12 @@ NV_STATUS uvm_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
if (!parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count)
return NV_ERR_NO_MEMORY;
block_context = uvm_va_block_context_alloc(NULL);
if (!block_context)
return NV_ERR_NO_MEMORY;
parent_gpu->fault_buffer_info.non_replayable.block_service_context.block_context = block_context;
parent_gpu->isr.non_replayable_faults.handling = true;
snprintf(kthread_name, sizeof(kthread_name), "UVM GPU%u KC", uvm_id_value(parent_gpu->id));
@@ -356,6 +369,13 @@ NV_STATUS uvm_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
return status;
}
block_context = uvm_va_block_context_alloc(NULL);
if (!block_context)
return NV_ERR_NO_MEMORY;
parent_gpu->access_counter_buffer_info.batch_service_context.block_service_context.block_context =
block_context;
nv_kthread_q_item_init(&parent_gpu->isr.access_counters.bottom_half_q_item,
access_counters_isr_bottom_half_entry,
parent_gpu);
@@ -410,6 +430,8 @@ void uvm_gpu_disable_isr(uvm_parent_gpu_t *parent_gpu)
void uvm_gpu_deinit_isr(uvm_parent_gpu_t *parent_gpu)
{
uvm_va_block_context_t *block_context;
// Return ownership to RM:
if (parent_gpu->isr.replayable_faults.was_handling) {
// No user threads could have anything left on
@@ -439,8 +461,18 @@ void uvm_gpu_deinit_isr(uvm_parent_gpu_t *parent_gpu)
// It is safe to deinitialize access counters even if they have not been
// successfully initialized.
uvm_gpu_deinit_access_counters(parent_gpu);
block_context =
parent_gpu->access_counter_buffer_info.batch_service_context.block_service_context.block_context;
uvm_va_block_context_free(block_context);
}
if (parent_gpu->non_replayable_faults_supported) {
block_context = parent_gpu->fault_buffer_info.non_replayable.block_service_context.block_context;
uvm_va_block_context_free(block_context);
}
block_context = parent_gpu->fault_buffer_info.replayable.block_service_context.block_context;
uvm_va_block_context_free(block_context);
uvm_kvfree(parent_gpu->isr.replayable_faults.stats.cpu_exec_count);
uvm_kvfree(parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count);
uvm_kvfree(parent_gpu->isr.access_counters.stats.cpu_exec_count);

View File

@@ -370,7 +370,7 @@ static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
// Check logical permissions
status = uvm_va_block_check_logical_permissions(va_block,
&service_context->block_context,
service_context->block_context,
gpu->id,
uvm_va_block_cpu_page_index(va_block,
fault_entry->fault_address),
@@ -393,7 +393,7 @@ static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
// Compute new residency and update the masks
new_residency = uvm_va_block_select_residency(va_block,
&service_context->block_context,
service_context->block_context,
page_index,
gpu->id,
fault_entry->access_type_mask,
@@ -629,7 +629,7 @@ static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_e
uvm_gpu_va_space_t *gpu_va_space;
uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
uvm_va_block_context_t *va_block_context =
&gpu->parent->fault_buffer_info.non_replayable.block_service_context.block_context;
gpu->parent->fault_buffer_info.non_replayable.block_service_context.block_context;
status = uvm_gpu_fault_entry_to_va_space(gpu, fault_entry, &va_space);
if (status != NV_OK) {
@@ -655,7 +655,7 @@ static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_e
// to remain valid until we release. If no mm is registered, we
// can only service managed faults, not ATS/HMM faults.
mm = uvm_va_space_mm_retain_lock(va_space);
va_block_context->mm = mm;
uvm_va_block_context_init(va_block_context, mm);
uvm_va_space_down_read(va_space);

View File

@@ -1180,7 +1180,11 @@ static void mark_fault_fatal(uvm_fault_service_batch_context_t *batch_context,
fault_entry->replayable.cancel_va_mode = cancel_va_mode;
utlb->has_fatal_faults = true;
batch_context->has_fatal_faults = true;
if (!batch_context->fatal_va_space) {
UVM_ASSERT(fault_entry->va_space);
batch_context->fatal_va_space = fault_entry->va_space;
}
}
static void fault_entry_duplicate_flags(uvm_fault_service_batch_context_t *batch_context,
@@ -1230,7 +1234,7 @@ static uvm_fault_access_type_t check_fault_access_permissions(uvm_gpu_t *gpu,
UvmEventFatalReason fatal_reason;
uvm_fault_cancel_va_mode_t cancel_va_mode;
uvm_fault_access_type_t ret = UVM_FAULT_ACCESS_TYPE_COUNT;
uvm_va_block_context_t *va_block_context = &service_block_context->block_context;
uvm_va_block_context_t *va_block_context = service_block_context->block_context;
perm_status = uvm_va_block_check_logical_permissions(va_block,
va_block_context,
@@ -1345,7 +1349,7 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
if (uvm_va_block_is_hmm(va_block)) {
policy = uvm_hmm_find_policy_end(va_block,
block_context->block_context.hmm.vma,
block_context->block_context->hmm.vma,
ordered_fault_cache[first_fault_index]->fault_address,
&end);
}
@@ -1469,7 +1473,7 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
// Compute new residency and update the masks
new_residency = uvm_va_block_select_residency(va_block,
&block_context->block_context,
block_context->block_context,
page_index,
gpu->id,
service_access_type_mask,
@@ -1511,8 +1515,8 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
++block_context->num_retries;
if (status == NV_OK && batch_context->has_fatal_faults)
status = uvm_va_block_set_cancel(va_block, &block_context->block_context, gpu);
if (status == NV_OK && batch_context->fatal_va_space)
status = uvm_va_block_set_cancel(va_block, block_context->block_context, gpu);
return status;
}
@@ -1860,7 +1864,7 @@ static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
uvm_va_block_t *va_block;
uvm_gpu_t *gpu = gpu_va_space->gpu;
uvm_va_block_context_t *va_block_context =
&gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[fault_index];
struct mm_struct *mm = va_block_context->mm;
NvU64 fault_address = current_entry->fault_address;
@@ -1937,14 +1941,198 @@ static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
return status;
}
// Called when a fault in the batch has been marked fatal. Flush the buffer
// under the VA and mmap locks to remove any potential stale fatal faults, then
// service all new faults for just that VA space and cancel those which are
// fatal. Faults in other VA spaces are replayed when done and will be processed
// when normal fault servicing resumes.
static NV_STATUS service_fault_batch_for_cancel(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
NV_STATUS status = NV_OK;
NvU32 i;
uvm_va_space_t *va_space = batch_context->fatal_va_space;
uvm_gpu_va_space_t *gpu_va_space = NULL;
struct mm_struct *mm;
uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
uvm_service_block_context_t *service_context = &gpu->parent->fault_buffer_info.replayable.block_service_context;
uvm_va_block_context_t *va_block_context = service_context->block_context;
UVM_ASSERT(gpu->parent->replayable_faults_supported);
UVM_ASSERT(va_space);
// Perform the flush and re-fetch while holding the mmap_lock and the
// VA space lock. This avoids stale faults because it prevents any vma
// modifications (mmap, munmap, mprotect) from happening between the time HW
// takes the fault and we cancel it.
mm = uvm_va_space_mm_retain_lock(va_space);
uvm_va_block_context_init(va_block_context, mm);
uvm_va_space_down_read(va_space);
// We saw fatal faults in this VA space before. Flush while holding
// mmap_lock to make sure those faults come back (aren't stale).
//
// We need to wait until all old fault messages have arrived before
// flushing, hence UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT.
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status != NV_OK)
goto done;
// Wait for the flush's replay to finish to give the legitimate faults a
// chance to show up in the buffer again.
status = uvm_tracker_wait(&replayable_faults->replay_tracker);
if (status != NV_OK)
goto done;
// We expect all replayed faults to have arrived in the buffer so we can re-
// service them. The replay-and-wait sequence above will ensure they're all
// in the HW buffer. When GSP owns the HW buffer, we also have to wait for
// GSP to copy all available faults from the HW buffer into the shadow
// buffer.
//
// TODO: Bug 2533557: This flush does not actually guarantee that GSP will
// copy over all faults.
status = hw_fault_buffer_flush_locked(gpu->parent);
if (status != NV_OK)
goto done;
// If there is no GPU VA space for the GPU, ignore all faults in the VA
// space. This can happen if the GPU VA space has been destroyed since we
// unlocked the VA space in service_fault_batch. That means the fatal faults
// are stale, because unregistering the GPU VA space requires preempting the
// context and detaching all channels in that VA space. Restart fault
// servicing from the top.
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (!gpu_va_space)
goto done;
// Re-parse the new faults
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_duplicate_faults = 0;
batch_context->num_replays = 0;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_ALL);
if (status != NV_OK)
goto done;
// No more faults left. Either the previously-seen fatal entry was stale, or
// RM killed the context underneath us.
if (batch_context->num_cached_faults == 0)
goto done;
++batch_context->batch_id;
status = preprocess_fault_batch(gpu, batch_context);
if (status != NV_OK) {
if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
// Another flush happened due to stale faults or a context-fatal
// error. The previously-seen fatal fault might not exist anymore,
// so restart fault servicing from the top.
status = NV_OK;
}
goto done;
}
// Search for the target VA space
for (i = 0; i < batch_context->num_coalesced_faults; i++) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
UVM_ASSERT(current_entry->va_space);
if (current_entry->va_space == va_space)
break;
}
while (i < batch_context->num_coalesced_faults) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
if (current_entry->va_space != va_space)
break;
// service_fault_batch_dispatch() doesn't expect unserviceable faults.
// Just cancel them directly.
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, UVM_FAULT_CANCEL_VA_MODE_ALL);
if (status != NV_OK)
break;
++i;
}
else {
uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
NvU32 block_faults;
ats_invalidate->write_faults_in_batch = false;
uvm_hmm_service_context_init(service_context);
// Service all the faults that we can. We only really need to search
// for fatal faults, but attempting to service all is the easiest
// way to do that.
status = service_fault_batch_dispatch(va_space, gpu_va_space, batch_context, i, &block_faults, false);
if (status != NV_OK) {
// TODO: Bug 3900733: clean up locking in service_fault_batch().
// We need to drop lock and retry. That means flushing and
// starting over.
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
status = NV_OK;
break;
}
// Invalidate TLBs before cancel to ensure that fatal faults don't
// get stuck in HW behind non-fatal faults to the same line.
status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
if (status != NV_OK)
break;
while (block_faults-- > 0) {
current_entry = batch_context->ordered_fault_cache[i];
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode);
if (status != NV_OK)
break;
}
++i;
}
}
}
done:
uvm_va_space_up_read(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
if (status == NV_OK) {
// There are two reasons to flush the fault buffer here.
//
// 1) Functional. We need to replay both the serviced non-fatal faults
// and the skipped faults in other VA spaces. The former need to be
// restarted and the latter need to be replayed so the normal fault
// service mechanism can fetch and process them.
//
// 2) Performance. After cancelling the fatal faults, a flush removes
// any potential duplicated fault that may have been added while
// processing the faults in this batch. This flush also avoids doing
// unnecessary processing after the fatal faults have been cancelled,
// so all the rest are unlikely to remain after a replay because the
// context is probably in the process of dying.
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
}
return status;
}
// Scan the ordered view of faults and group them by different va_blocks
// (managed faults) and service faults for each va_block, in batch.
// Service non-managed faults one at a time as they are encountered during the
// scan.
//
// This function returns NV_WARN_MORE_PROCESSING_REQUIRED if the fault buffer
// was flushed because the needs_fault_buffer_flush flag was set on some GPU VA
// space
// Fatal faults are marked for later processing by the caller.
static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
fault_service_mode_t service_mode,
uvm_fault_service_batch_context_t *batch_context)
@@ -1959,7 +2147,7 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK;
uvm_service_block_context_t *service_context =
&gpu->parent->fault_buffer_info.replayable.block_service_context;
uvm_va_block_context_t *va_block_context = &service_context->block_context;
uvm_va_block_context_t *va_block_context = service_context->block_context;
UVM_ASSERT(gpu->parent->replayable_faults_supported);
@@ -1995,41 +2183,28 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
// to remain valid until we release. If no mm is registered, we
// can only service managed faults, not ATS/HMM faults.
mm = uvm_va_space_mm_retain_lock(va_space);
va_block_context->mm = mm;
uvm_va_block_context_init(va_block_context, mm);
uvm_va_space_down_read(va_space);
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (uvm_processor_mask_test_and_clear_atomic(&va_space->needs_fault_buffer_flush, gpu->id)) {
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status == NV_OK)
status = NV_WARN_MORE_PROCESSING_REQUIRED;
break;
}
// The case where there is no valid GPU VA space for the GPU in this
// VA space is handled next
}
// Some faults could be already fatal if they cannot be handled by
// the UVM driver
if (current_entry->is_fatal) {
++i;
batch_context->has_fatal_faults = true;
if (!batch_context->fatal_va_space)
batch_context->fatal_va_space = va_space;
utlb->has_fatal_faults = true;
UVM_ASSERT(utlb->num_pending_faults > 0);
continue;
}
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
if (!gpu_va_space) {
// If there is no GPU VA space for the GPU, ignore the fault. This
// can happen if a GPU VA space is destroyed without explicitly
// freeing all memory ranges (destroying the VA range triggers a
// flush of the fault buffer) and there are stale entries in the
// freeing all memory ranges and there are stale entries in the
// buffer that got fixed by the servicing in a previous batch.
++i;
continue;
@@ -2057,7 +2232,7 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
i += block_faults;
// Don't issue replays in cancel mode
if (replay_per_va_block && !batch_context->has_fatal_faults) {
if (replay_per_va_block && !batch_context->fatal_va_space) {
status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
if (status != NV_OK)
goto fail;
@@ -2069,8 +2244,6 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
}
}
// Only clobber status if invalidate_status != NV_OK, since status may also
// contain NV_WARN_MORE_PROCESSING_REQUIRED.
if (va_space != NULL) {
NV_STATUS invalidate_status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
if (invalidate_status != NV_OK)
@@ -2278,64 +2451,6 @@ static NvU32 is_fatal_fault_in_buffer(uvm_fault_service_batch_context_t *batch_c
return false;
}
// Cancel just the faults flagged as fatal in the given fault service batch
// context.
static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
NV_STATUS status = NV_OK;
NV_STATUS fault_status;
uvm_va_space_t *va_space = NULL;
NvU32 i;
UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
UVM_ASSERT(current_entry->va_space);
if (current_entry->va_space != va_space) {
// Fault on a different va_space, drop the lock of the old one...
if (va_space != NULL)
uvm_va_space_up_read(va_space);
va_space = current_entry->va_space;
// ... and take the lock of the new one
uvm_va_space_down_read(va_space);
// We don't need to check whether a buffer flush is required
// (due to VA range destruction). Once a fault is flagged as fatal
// we need to cancel it, even if its VA range no longer exists.
}
// See the comment for the same check in cancel_faults_all
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id))
continue;
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode);
if (status != NV_OK)
break;
}
}
if (va_space != NULL)
uvm_va_space_up_read(va_space);
// See the comment on flushing in cancel_faults_all
fault_status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
// We report the first encountered error.
if (status == NV_OK)
status = fault_status;
return status;
}
// Cancel all faults in the given fault service batch context, even those not
// marked as fatal.
static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
@@ -2344,56 +2459,51 @@ static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
{
NV_STATUS status = NV_OK;
NV_STATUS fault_status;
uvm_va_space_t *va_space = NULL;
NvU32 i;
NvU32 i = 0;
UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
while (i < batch_context->num_coalesced_faults && status == NV_OK) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
uvm_fault_cancel_va_mode_t cancel_va_mode;
uvm_va_space_t *va_space = current_entry->va_space;
bool skip_va_space;
UVM_ASSERT(current_entry->va_space);
UVM_ASSERT(va_space);
if (current_entry->va_space != va_space) {
// Fault on a different va_space, drop the lock of the old one...
if (va_space != NULL)
uvm_va_space_up_read(va_space);
uvm_va_space_down_read(va_space);
va_space = current_entry->va_space;
// If there is no GPU VA space for the GPU, ignore all faults in
// that VA space. This can happen if the GPU VA space has been
// destroyed since we unlocked the VA space in service_fault_batch.
// Ignoring the fault avoids targeting a PDB that might have been
// reused by another process.
skip_va_space = !uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
// ... and take the lock of the new one
uvm_va_space_down_read(va_space);
for (;
i < batch_context->num_coalesced_faults && current_entry->va_space == va_space;
current_entry = batch_context->ordered_fault_cache[++i]) {
uvm_fault_cancel_va_mode_t cancel_va_mode;
if (skip_va_space)
continue;
if (current_entry->is_fatal) {
UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
cancel_va_mode = current_entry->replayable.cancel_va_mode;
}
else {
current_entry->fatal_reason = reason;
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
if (status != NV_OK)
break;
}
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
// If there is no GPU VA space for the GPU, ignore the fault.
// This can happen if the GPU VA space did not exist in
// service_fault_batch(), or it was destroyed since then.
// This is to avoid targeting a PDB that might have been reused
// by another process.
continue;
}
// If the fault was already marked fatal, use its reason and cancel
// mode. Otherwise use the provided reason.
if (current_entry->is_fatal) {
UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
cancel_va_mode = current_entry->replayable.cancel_va_mode;
}
else {
current_entry->fatal_reason = reason;
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
if (status != NV_OK)
break;
}
if (va_space != NULL)
uvm_va_space_up_read(va_space);
}
// Because each cancel itself triggers a replay, there may be a large number
// of new duplicated faults in the buffer after cancelling all the known
@@ -2537,7 +2647,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_replays = 0;
batch_context->has_fatal_faults = false;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
// 5) Fetch all faults from buffer
@@ -2584,9 +2694,6 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
// 8) Service all non-fatal faults and mark all non-serviceable faults
// as fatal
status = service_fault_batch(gpu, FAULT_SERVICE_MODE_CANCEL, batch_context);
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
continue;
UVM_ASSERT(batch_context->num_replays == 0);
if (status == NV_ERR_NO_MEMORY)
continue;
@@ -2594,7 +2701,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
break;
// No more fatal faults left, we are done
if (!batch_context->has_fatal_faults)
if (!batch_context->fatal_va_space)
break;
// 9) Search for uTLBs that contain fatal faults and meet the
@@ -2616,9 +2723,9 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
static NV_STATUS cancel_faults_precise(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
UVM_ASSERT(batch_context->has_fatal_faults);
UVM_ASSERT(batch_context->fatal_va_space);
if (gpu->parent->fault_cancel_va_supported)
return cancel_faults_precise_va(gpu, batch_context);
return service_fault_batch_for_cancel(gpu, batch_context);
return cancel_faults_precise_tlb(gpu, batch_context);
}
@@ -2674,7 +2781,7 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_duplicate_faults = 0;
batch_context->num_replays = 0;
batch_context->has_fatal_faults = false;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_BATCH_READY);
@@ -2702,9 +2809,6 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
// was flushed
num_replays += batch_context->num_replays;
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
continue;
enable_disable_prefetch_faults(gpu->parent, batch_context);
if (status != NV_OK) {
@@ -2718,10 +2822,17 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
break;
}
if (batch_context->has_fatal_faults) {
if (batch_context->fatal_va_space) {
status = uvm_tracker_wait(&batch_context->tracker);
if (status == NV_OK)
if (status == NV_OK) {
status = cancel_faults_precise(gpu, batch_context);
if (status == NV_OK) {
// Cancel handling should've issued at least one replay
UVM_ASSERT(batch_context->num_replays > 0);
++num_batches;
continue;
}
}
break;
}

View File

@@ -794,7 +794,7 @@ uvm_membar_t uvm_hal_downgrade_membar_type(uvm_gpu_t *gpu, bool is_local_vidmem)
// memory, including those from other processors like the CPU or peer GPUs,
// must come through this GPU's L2. In all current architectures, MEMBAR_GPU
// is sufficient to resolve ordering at the L2 level.
if (is_local_vidmem && !uvm_gpu_is_coherent(gpu->parent) && !uvm_downgrade_force_membar_sys)
if (is_local_vidmem && !uvm_parent_gpu_is_coherent(gpu->parent) && !uvm_downgrade_force_membar_sys)
return UVM_MEMBAR_GPU;
// If the mapped memory was remote, or if a coherence protocol can cache

View File

@@ -60,6 +60,8 @@ module_param(uvm_disable_hmm, bool, 0444);
#include "uvm_gpu.h"
#include "uvm_pmm_gpu.h"
#include "uvm_hal_types.h"
#include "uvm_push.h"
#include "uvm_hal.h"
#include "uvm_va_block_types.h"
#include "uvm_va_space_mm.h"
#include "uvm_va_space.h"
@@ -110,20 +112,7 @@ typedef struct
bool uvm_hmm_is_enabled_system_wide(void)
{
if (uvm_disable_hmm)
return false;
if (g_uvm_global.ats.enabled)
return false;
// Confidential Computing and HMM impose mutually exclusive constraints. In
// Confidential Computing the GPU can only access pages resident in vidmem,
// but in HMM pages may be required to be resident in sysmem: file backed
// VMAs, huge pages, etc.
if (g_uvm_global.conf_computing_enabled)
return false;
return uvm_va_space_mm_enabled_system();
return !uvm_disable_hmm && !g_uvm_global.ats.enabled && uvm_va_space_mm_enabled_system();
}
bool uvm_hmm_is_enabled(uvm_va_space_t *va_space)
@@ -140,6 +129,100 @@ static uvm_va_block_t *hmm_va_block_from_node(uvm_range_tree_node_t *node)
return container_of(node, uvm_va_block_t, hmm.node);
}
// Copies the contents of the source device-private page to the
// destination CPU page. This will invalidate mappings, so cannot be
// called while holding any va_block locks.
static NV_STATUS uvm_hmm_copy_devmem_page(struct page *dst_page, struct page *src_page, uvm_tracker_t *tracker)
{
uvm_gpu_phys_address_t src_addr;
uvm_gpu_phys_address_t dst_addr;
uvm_gpu_chunk_t *gpu_chunk;
NvU64 dma_addr;
uvm_push_t push;
NV_STATUS status = NV_OK;
uvm_gpu_t *gpu;
// Holding a reference on the device-private page ensures the GPU is
// already retained. This is because when a GPU is unregistered all
// device-private pages are migrated back to the CPU and freed before
// releasing the GPU. Therefore, if we could get a reference to the
// page, the GPU must be retained.
UVM_ASSERT(is_device_private_page(src_page) && page_count(src_page));
gpu_chunk = uvm_pmm_devmem_page_to_chunk(src_page);
gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
status = uvm_mmu_chunk_map(gpu_chunk);
if (status != NV_OK)
return status;
status = uvm_gpu_map_cpu_pages(gpu->parent, dst_page, PAGE_SIZE, &dma_addr);
if (status != NV_OK)
goto out_unmap_gpu;
dst_addr = uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
src_addr = uvm_gpu_phys_address(UVM_APERTURE_VID, gpu_chunk->address);
status = uvm_push_begin_acquire(gpu->channel_manager,
UVM_CHANNEL_TYPE_GPU_TO_CPU,
tracker,
&push,
"Copy for remote process fault");
if (status != NV_OK)
goto out_unmap_cpu;
gpu->parent->ce_hal->memcopy(&push,
uvm_gpu_address_copy(gpu, dst_addr),
uvm_gpu_address_copy(gpu, src_addr),
PAGE_SIZE);
uvm_push_end(&push);
status = uvm_tracker_add_push_safe(tracker, &push);
out_unmap_cpu:
uvm_gpu_unmap_cpu_pages(gpu->parent, dma_addr, PAGE_SIZE);
out_unmap_gpu:
uvm_mmu_chunk_unmap(gpu_chunk, NULL);
return status;
}
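// Evict a single device-private page, identified by its PFN, back to a newly
// allocated system memory page. The contents are copied with
// uvm_hmm_copy_devmem_page(); if the copy fails, the destination page is
// zero-filled instead. Returns NV_ERR_BUSY_RETRY if the page could not be
// migrated.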
static NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
{
unsigned long src_pfn = 0;
unsigned long dst_pfn = 0;
struct page *dst_page;
NV_STATUS status = NV_OK;
int ret;
ret = migrate_device_range(&src_pfn, pfn, 1);
if (ret)
return errno_to_nv_status(ret);
if (src_pfn & MIGRATE_PFN_MIGRATE) {
uvm_tracker_t tracker = UVM_TRACKER_INIT();
dst_page = alloc_page(GFP_HIGHUSER_MOVABLE);
if (!dst_page) {
status = NV_ERR_NO_MEMORY;
goto out;
}
lock_page(dst_page);
if (WARN_ON(uvm_hmm_copy_devmem_page(dst_page, migrate_pfn_to_page(src_pfn), &tracker) != NV_OK))
memzero_page(dst_page, 0, PAGE_SIZE);
dst_pfn = migrate_pfn(page_to_pfn(dst_page));
migrate_device_pages(&src_pfn, &dst_pfn, 1);
uvm_tracker_wait_deinit(&tracker);
}
out:
migrate_device_finalize(&src_pfn, &dst_pfn, 1);
if (!(src_pfn & MIGRATE_PFN_MIGRATE))
status = NV_ERR_BUSY_RETRY;
return status;
}
void uvm_hmm_va_space_initialize(uvm_va_space_t *va_space)
{
uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
@@ -199,6 +282,9 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
{
uvm_range_tree_node_t *node;
uvm_va_block_t *va_block;
struct range range = gpu->pmm.devmem.pagemap.range;
unsigned long pfn;
bool retry;
if (!uvm_hmm_is_enabled(va_space))
return;
@@ -207,6 +293,29 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked_write(&va_space->lock);
// There could be pages with page->zone_device_data pointing to the va_space
// which may be about to be freed. Migrate those back to the CPU so we don't
// fault on them. Normally infinite retries are bad, but we don't have any
// option here. Device-private pages can't be pinned so migration should
// eventually succeed. Even if we did eventually bail out of the loop we'd
// just stall in memunmap_pages() anyway.
do {
retry = false;
for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
struct page *page = pfn_to_page(pfn);
UVM_ASSERT(is_device_private_page(page));
// This check is racy because nothing stops the page from being freed
// and even reused. That doesn't matter though: in the worst case the
// migration fails, we retry and find the va_space doesn't match.
if (page->zone_device_data == va_space)
if (uvm_hmm_pmm_gpu_evict_pfn(pfn) != NV_OK)
retry = true;
}
} while (retry);
uvm_range_tree_for_each(node, &va_space->hmm.blocks) {
va_block = hmm_va_block_from_node(node);
@@ -568,7 +677,7 @@ bool uvm_hmm_check_context_vma_is_valid(uvm_va_block_t *va_block,
void uvm_hmm_service_context_init(uvm_service_block_context_t *service_context)
{
// TODO: Bug 4050579: Remove this when swap cached pages can be migrated.
service_context->block_context.hmm.swap_cached = false;
service_context->block_context->hmm.swap_cached = false;
}
NV_STATUS uvm_hmm_migrate_begin(uvm_va_block_t *va_block)
@@ -631,47 +740,6 @@ static NV_STATUS hmm_migrate_range(uvm_va_block_t *va_block,
return status;
}
void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space)
{
// We can't use uvm_va_space_mm_retain(), because the va_space_mm
// should already be dead by now.
struct mm_struct *mm = va_space->va_space_mm.mm;
uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
uvm_range_tree_node_t *node, *next;
uvm_va_block_t *va_block;
uvm_va_block_context_t *block_context;
uvm_down_read_mmap_lock(mm);
uvm_va_space_down_write(va_space);
uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) {
uvm_va_block_region_t region;
struct vm_area_struct *vma;
va_block = hmm_va_block_from_node(node);
block_context = uvm_va_space_block_context(va_space, mm);
uvm_hmm_migrate_begin_wait(va_block);
uvm_mutex_lock(&va_block->lock);
for_each_va_block_vma_region(va_block, mm, vma, &region) {
if (!uvm_hmm_vma_is_valid(vma, vma->vm_start, false))
continue;
block_context->hmm.vma = vma;
uvm_hmm_va_block_migrate_locked(va_block,
NULL,
block_context,
UVM_ID_CPU,
region,
UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
}
uvm_mutex_unlock(&va_block->lock);
uvm_hmm_migrate_finish(va_block);
}
uvm_va_space_up_write(va_space);
uvm_up_read_mmap_lock(mm);
}
NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr)
{
uvm_va_block_test_t *block_test;
@@ -1476,40 +1544,59 @@ static NV_STATUS hmm_va_block_cpu_page_populate(uvm_va_block_t *va_block,
return status;
}
status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, page_index);
status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, chunk, page_index);
if (status != NV_OK) {
uvm_cpu_chunk_remove_from_block(va_block, page_index);
uvm_cpu_chunk_remove_from_block(va_block, page_to_nid(page), page_index);
uvm_cpu_chunk_free(chunk);
}
return status;
}
static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block,
uvm_page_index_t page_index)
static void hmm_va_block_cpu_unpopulate_chunk(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
int chunk_nid,
uvm_page_index_t page_index)
{
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
if (!chunk)
return;
UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_test(&va_block->cpu.resident, page_index));
!uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == PAGE_SIZE);
uvm_cpu_chunk_remove_from_block(va_block, page_index);
uvm_cpu_chunk_remove_from_block(va_block, chunk_nid, page_index);
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
uvm_cpu_chunk_free(chunk);
}
static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block, uvm_page_index_t page_index, struct page *page)
{
uvm_cpu_chunk_t *chunk;
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
if (page) {
chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index);
hmm_va_block_cpu_unpopulate_chunk(va_block, chunk, page_to_nid(page), page_index);
}
else {
int nid;
for_each_possible_uvm_node(nid) {
chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
hmm_va_block_cpu_unpopulate_chunk(va_block, chunk, nid, page_index);
}
}
}
static bool hmm_va_block_cpu_page_is_same(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
struct page *page)
{
struct page *old_page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);
struct page *old_page = uvm_va_block_get_cpu_page(va_block, page_index);
UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_index)));
UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index)));
return old_page == page;
}
@@ -1522,7 +1609,7 @@ static void clear_service_context_masks(uvm_service_block_context_t *service_con
uvm_processor_id_t new_residency,
uvm_page_index_t page_index)
{
uvm_page_mask_clear(&service_context->block_context.caller_page_mask, page_index);
uvm_page_mask_clear(&service_context->block_context->caller_page_mask, page_index);
uvm_page_mask_clear(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency,
page_index);
@@ -1549,7 +1636,6 @@ static void cpu_mapping_set(uvm_va_block_t *va_block,
uvm_page_index_t page_index)
{
uvm_processor_mask_set(&va_block->mapped, UVM_ID_CPU);
uvm_page_mask_set(&va_block->maybe_mapped_pages, page_index);
uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index);
if (is_write)
uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
@@ -1699,7 +1785,7 @@ static NV_STATUS sync_page_and_chunk_state(uvm_va_block_t *va_block,
// migrate_vma_finalize() will release the reference so we should
// clear our pointer to it.
// TODO: Bug 3660922: Need to handle read duplication at some point.
hmm_va_block_cpu_page_unpopulate(va_block, page_index);
hmm_va_block_cpu_page_unpopulate(va_block, page_index, page);
}
}
@@ -1725,7 +1811,7 @@ static void clean_up_non_migrating_page(uvm_va_block_t *va_block,
else {
UVM_ASSERT(page_ref_count(dst_page) == 1);
hmm_va_block_cpu_page_unpopulate(va_block, page_index);
hmm_va_block_cpu_page_unpopulate(va_block, page_index, dst_page);
}
unlock_page(dst_page);
@@ -1760,7 +1846,7 @@ static void lock_block_cpu_page(uvm_va_block_t *va_block,
unsigned long *dst_pfns,
uvm_page_mask_t *same_devmem_page_mask)
{
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(src_page), page_index);
uvm_va_block_region_t chunk_region;
struct page *dst_page;
@@ -1786,7 +1872,7 @@ static void lock_block_cpu_page(uvm_va_block_t *va_block,
// hmm_va_block_cpu_page_unpopulate() or block_kill(). If the page
// does not migrate, it will be freed though.
UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_test(&va_block->cpu.resident, page_index));
!uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
UVM_ASSERT(chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL);
UVM_ASSERT(page_ref_count(dst_page) == 1);
uvm_cpu_chunk_make_hmm(chunk);
@@ -1934,7 +2020,7 @@ static NV_STATUS alloc_and_copy_to_cpu(uvm_va_block_t *va_block,
}
UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_test(&va_block->cpu.resident, page_index));
!uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
// Allocate a user system memory page for the destination.
// This is the typical case since Linux will free the source page when
@@ -2012,8 +2098,8 @@ static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_contex
service_context = devmem_fault_context->service_context;
va_block_retry = devmem_fault_context->va_block_retry;
va_block = devmem_fault_context->va_block;
src_pfns = service_context->block_context.hmm.src_pfns;
dst_pfns = service_context->block_context.hmm.dst_pfns;
src_pfns = service_context->block_context->hmm.src_pfns;
dst_pfns = service_context->block_context->hmm.dst_pfns;
// Build the migration page mask.
// Note that thrashing pinned pages and prefetch pages are already
@@ -2022,7 +2108,7 @@ static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_contex
uvm_page_mask_copy(page_mask, &service_context->per_processor_masks[UVM_ID_CPU_VALUE].new_residency);
status = alloc_and_copy_to_cpu(va_block,
service_context->block_context.hmm.vma,
service_context->block_context->hmm.vma,
src_pfns,
dst_pfns,
service_context->region,
@@ -2057,8 +2143,8 @@ static NV_STATUS uvm_hmm_devmem_fault_finalize_and_map(uvm_hmm_devmem_fault_cont
prefetch_hint = &service_context->prefetch_hint;
va_block = devmem_fault_context->va_block;
va_block_retry = devmem_fault_context->va_block_retry;
src_pfns = service_context->block_context.hmm.src_pfns;
dst_pfns = service_context->block_context.hmm.dst_pfns;
src_pfns = service_context->block_context->hmm.src_pfns;
dst_pfns = service_context->block_context->hmm.dst_pfns;
region = service_context->region;
page_mask = &devmem_fault_context->page_mask;
@@ -2165,8 +2251,7 @@ static NV_STATUS populate_region(uvm_va_block_t *va_block,
// Since we have a stable snapshot of the CPU pages, we can
// update the residency and protection information.
uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
uvm_page_mask_set(&va_block->cpu.resident, page_index);
uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index);
cpu_mapping_set(va_block, pfns[page_index] & HMM_PFN_WRITE, page_index);
}
@@ -2253,7 +2338,7 @@ static void hmm_release_atomic_pages(uvm_va_block_t *va_block,
uvm_page_index_t page_index;
for_each_va_block_page_in_region(page_index, region) {
struct page *page = service_context->block_context.hmm.pages[page_index];
struct page *page = service_context->block_context->hmm.pages[page_index];
if (!page)
continue;
@@ -2269,14 +2354,14 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
uvm_service_block_context_t *service_context)
{
uvm_va_block_region_t region = service_context->region;
struct page **pages = service_context->block_context.hmm.pages;
struct page **pages = service_context->block_context->hmm.pages;
int npages;
uvm_page_index_t page_index;
uvm_make_resident_cause_t cause;
NV_STATUS status;
if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_region_full(&va_block->cpu.resident, region)) {
!uvm_va_block_cpu_is_region_resident_on(va_block, NUMA_NO_NODE, region)) {
// There is an atomic GPU fault. We need to make sure no pages are
// GPU resident so that make_device_exclusive_range() doesn't call
// migrate_to_ram() and cause a va_space lock recursion problem.
@@ -2289,7 +2374,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
status = uvm_hmm_va_block_migrate_locked(va_block,
va_block_retry,
&service_context->block_context,
service_context->block_context,
UVM_ID_CPU,
region,
cause);
@@ -2299,7 +2384,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
// make_device_exclusive_range() will try to call migrate_to_ram()
// and deadlock with ourself if the data isn't CPU resident.
if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_region_full(&va_block->cpu.resident, region)) {
!uvm_va_block_cpu_is_region_resident_on(va_block, NUMA_NO_NODE, region)) {
status = NV_WARN_MORE_PROCESSING_REQUIRED;
goto done;
}
@@ -2309,7 +2394,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
// mmap() files so we check for that here and report a fatal fault.
// Otherwise with the current Linux 6.1 make_device_exclusive_range(),
// it doesn't make the page exclusive and we end up in an endless loop.
if (service_context->block_context.hmm.vma->vm_flags & VM_SHARED) {
if (service_context->block_context->hmm.vma->vm_flags & (VM_SHARED | VM_HUGETLB)) {
status = NV_ERR_NOT_SUPPORTED;
goto done;
}
@@ -2318,7 +2403,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
uvm_mutex_unlock(&va_block->lock);
npages = make_device_exclusive_range(service_context->block_context.mm,
npages = make_device_exclusive_range(service_context->block_context->mm,
uvm_va_block_cpu_page_address(va_block, region.first),
uvm_va_block_cpu_page_address(va_block, region.outer - 1) + PAGE_SIZE,
pages + region.first,
@@ -2356,15 +2441,13 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page));
UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index));
UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
}
else {
NV_STATUS s = hmm_va_block_cpu_page_populate(va_block, page_index, page);
if (s == NV_OK) {
uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
uvm_page_mask_set(&va_block->cpu.resident, page_index);
}
if (s == NV_OK)
uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index);
}
cpu_mapping_clear(va_block, page_index);
@@ -2419,7 +2502,7 @@ static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id,
uvm_service_block_context_t *service_context)
{
uvm_va_block_region_t region = service_context->region;
struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args;
struct migrate_vma *args = &service_context->block_context->hmm.migrate_vma_args;
NV_STATUS status;
int ret;
uvm_hmm_devmem_fault_context_t fault_context = {
@@ -2453,8 +2536,8 @@ static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id,
}
status = hmm_make_resident_cpu(va_block,
service_context->block_context.hmm.vma,
service_context->block_context.hmm.src_pfns,
service_context->block_context->hmm.vma,
service_context->block_context->hmm.src_pfns,
region,
service_context->access_type,
&fault_context.same_devmem_page_mask);
@@ -2476,9 +2559,9 @@ static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id,
}
}
args->vma = service_context->block_context.hmm.vma;
args->src = service_context->block_context.hmm.src_pfns + region.first;
args->dst = service_context->block_context.hmm.dst_pfns + region.first;
args->vma = service_context->block_context->hmm.vma;
args->src = service_context->block_context->hmm.src_pfns + region.first;
args->dst = service_context->block_context->hmm.dst_pfns + region.first;
args->start = uvm_va_block_region_start(va_block, region);
args->end = uvm_va_block_region_end(va_block, region) + 1;
args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
@@ -2558,7 +2641,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
// TODO: Bug 4050579: Remove this when swap cached pages can be
// migrated.
if (service_context) {
service_context->block_context.hmm.swap_cached = true;
service_context->block_context->hmm.swap_cached = true;
break;
}
@@ -2574,7 +2657,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, src_page));
UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index));
UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
}
else {
status = hmm_va_block_cpu_page_populate(va_block, page_index, src_page);
@@ -2588,8 +2671,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
// migrate_vma_setup() was able to isolate and lock the page;
// therefore, it is CPU resident and not mapped.
uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
uvm_page_mask_set(&va_block->cpu.resident, page_index);
uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(src_page), page_index);
}
// The call to migrate_vma_setup() will have inserted a migration
@@ -2604,7 +2686,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index));
hmm_va_block_cpu_page_unpopulate(va_block, page_index);
hmm_va_block_cpu_page_unpopulate(va_block, page_index, NULL);
}
}
@@ -2618,7 +2700,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
}
if (uvm_page_mask_empty(page_mask) ||
(service_context && service_context->block_context.hmm.swap_cached))
(service_context && service_context->block_context->hmm.swap_cached))
status = NV_WARN_MORE_PROCESSING_REQUIRED;
if (status != NV_OK)
@@ -2649,8 +2731,8 @@ static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma,
service_context = uvm_hmm_gpu_fault_event->service_context;
region = service_context->region;
prefetch_hint = &service_context->prefetch_hint;
src_pfns = service_context->block_context.hmm.src_pfns;
dst_pfns = service_context->block_context.hmm.dst_pfns;
src_pfns = service_context->block_context->hmm.src_pfns;
dst_pfns = service_context->block_context->hmm.dst_pfns;
// Build the migration mask.
// Note that thrashing pinned pages are already accounted for in
@@ -2708,8 +2790,8 @@ static NV_STATUS uvm_hmm_gpu_fault_finalize_and_map(uvm_hmm_gpu_fault_event_t *u
va_block = uvm_hmm_gpu_fault_event->va_block;
va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry;
service_context = uvm_hmm_gpu_fault_event->service_context;
src_pfns = service_context->block_context.hmm.src_pfns;
dst_pfns = service_context->block_context.hmm.dst_pfns;
src_pfns = service_context->block_context->hmm.src_pfns;
dst_pfns = service_context->block_context->hmm.dst_pfns;
region = service_context->region;
page_mask = &uvm_hmm_gpu_fault_event->page_mask;
@@ -2752,11 +2834,11 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
uvm_va_block_retry_t *va_block_retry,
uvm_service_block_context_t *service_context)
{
struct mm_struct *mm = service_context->block_context.mm;
struct vm_area_struct *vma = service_context->block_context.hmm.vma;
struct mm_struct *mm = service_context->block_context->mm;
struct vm_area_struct *vma = service_context->block_context->hmm.vma;
uvm_va_block_region_t region = service_context->region;
uvm_hmm_gpu_fault_event_t uvm_hmm_gpu_fault_event;
struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args;
struct migrate_vma *args = &service_context->block_context->hmm.migrate_vma_args;
int ret;
NV_STATUS status = NV_ERR_INVALID_ADDRESS;
@@ -2780,8 +2862,8 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
uvm_hmm_gpu_fault_event.service_context = service_context;
args->vma = vma;
args->src = service_context->block_context.hmm.src_pfns + region.first;
args->dst = service_context->block_context.hmm.dst_pfns + region.first;
args->src = service_context->block_context->hmm.src_pfns + region.first;
args->dst = service_context->block_context->hmm.dst_pfns + region.first;
args->start = uvm_va_block_region_start(va_block, region);
args->end = uvm_va_block_region_end(va_block, region) + 1;
args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM;
@@ -2815,8 +2897,8 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
// since migrate_vma_setup() would have reported that information.
// Try to make it resident in system memory and retry the migration.
status = hmm_make_resident_cpu(va_block,
service_context->block_context.hmm.vma,
service_context->block_context.hmm.src_pfns,
service_context->block_context->hmm.vma,
service_context->block_context->hmm.src_pfns,
region,
service_context->access_type,
NULL);
@@ -2962,16 +3044,6 @@ static NV_STATUS uvm_hmm_migrate_finalize(uvm_hmm_migrate_event_t *uvm_hmm_migra
&uvm_hmm_migrate_event->same_devmem_page_mask);
}
static bool is_resident(uvm_va_block_t *va_block,
uvm_processor_id_t dest_id,
uvm_va_block_region_t region)
{
if (!uvm_processor_mask_test(&va_block->resident, dest_id))
return false;
return uvm_page_mask_region_full(uvm_va_block_resident_mask_get(va_block, dest_id), region);
}
// Note that migrate_vma_*() doesn't handle asynchronous migrations so the
// migration flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP doesn't have an effect.
// TODO: Bug 3900785: investigate ways to implement async migration.
@@ -3063,9 +3135,7 @@ NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block,
uvm_page_mask_init_from_region(page_mask, region, NULL);
for_each_id_in_mask(id, &va_block->resident) {
if (!uvm_page_mask_andnot(page_mask,
page_mask,
uvm_va_block_resident_mask_get(va_block, id)))
if (!uvm_page_mask_andnot(page_mask, page_mask, uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE)))
return NV_OK;
}
@@ -3193,6 +3263,7 @@ static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
uvm_page_mask_t *page_mask = &uvm_hmm_migrate_event.page_mask;
const uvm_va_policy_t *policy;
uvm_va_policy_node_t *node;
uvm_page_mask_t *cpu_resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE);
unsigned long npages;
NV_STATUS status;
@@ -3215,7 +3286,7 @@ static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
// Pages resident on the GPU should not have a resident page in system
// memory.
// TODO: Bug 3660922: Need to handle read duplication at some point.
UVM_ASSERT(uvm_page_mask_region_empty(&va_block->cpu.resident, region));
UVM_ASSERT(uvm_page_mask_region_empty(cpu_resident_mask, region));
status = alloc_and_copy_to_cpu(va_block,
NULL,
@@ -3314,35 +3385,34 @@ NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block,
NULL);
}
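// Handle a CPU fault on a device-private page coming from a process other
// than the one that created the va_space which originally allocated the page
// (for example after fork()). There is no va_block to service such a fault,
// so the data is copied into a freshly allocated system memory page.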
NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
NV_STATUS uvm_hmm_remote_cpu_fault(struct vm_fault *vmf)
{
unsigned long src_pfn = 0;
unsigned long dst_pfn = 0;
struct page *dst_page;
NV_STATUS status = NV_OK;
unsigned long src_pfn;
unsigned long dst_pfn;
struct migrate_vma args;
struct page *src_page = vmf->page;
uvm_tracker_t tracker = UVM_TRACKER_INIT();
int ret;
ret = migrate_device_range(&src_pfn, pfn, 1);
if (ret)
return errno_to_nv_status(ret);
args.vma = vmf->vma;
args.src = &src_pfn;
args.dst = &dst_pfn;
args.start = nv_page_fault_va(vmf);
args.end = args.start + PAGE_SIZE;
args.pgmap_owner = &g_uvm_global;
args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
args.fault_page = src_page;
// We don't call migrate_vma_setup_locked() here because we don't
// have a va_block and don't want to ignore invalidations.
ret = migrate_vma_setup(&args);
UVM_ASSERT(!ret);
if (src_pfn & MIGRATE_PFN_MIGRATE) {
// All the code for copying a vidmem page to sysmem relies on
// having a va_block. However certain combinations of mremap()
// and fork() can result in device-private pages being mapped
// in a child process without a va_block.
//
// We don't expect the above to be a common occurrence so for
// now we allocate a fresh zero page when evicting without a
// va_block. However this results in child processes losing
// data so make sure we warn about it. Ideally we would just
// not migrate and SIGBUS the child if it tries to access the
// page. However that would prevent unloading of the driver so
// we're stuck with this until we fix the problem.
// TODO: Bug 3902536: add code to migrate GPU memory without having a
// va_block.
WARN_ON(1);
dst_page = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_ZERO);
struct page *dst_page;
dst_page = alloc_page(GFP_HIGHUSER_MOVABLE);
if (!dst_page) {
status = NV_ERR_NO_MEMORY;
goto out;
@@ -3351,11 +3421,15 @@ NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
lock_page(dst_page);
dst_pfn = migrate_pfn(page_to_pfn(dst_page));
migrate_device_pages(&src_pfn, &dst_pfn, 1);
status = uvm_hmm_copy_devmem_page(dst_page, src_page, &tracker);
if (status == NV_OK)
status = uvm_tracker_wait_deinit(&tracker);
}
migrate_vma_pages(&args);
out:
migrate_device_finalize(&src_pfn, &dst_pfn, 1);
migrate_vma_finalize(&args);
return status;
}
@@ -3606,4 +3680,3 @@ bool uvm_hmm_must_use_sysmem(uvm_va_block_t *va_block,
}
#endif // UVM_IS_CONFIG_HMM()

View File

@@ -307,10 +307,10 @@ typedef struct
uvm_migrate_mode_t mode,
uvm_tracker_t *out_tracker);
// Evicts all va_blocks in the va_space to the CPU. Unlike the
// other va_block eviction functions this is based on virtual
// address and therefore takes mmap_lock for read.
void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space);
// Handle a fault to a device-private page from a process other than the
// process which created the va_space that originally allocated the
// device-private page.
NV_STATUS uvm_hmm_remote_cpu_fault(struct vm_fault *vmf);
// This sets the va_block_context->hmm.src_pfns[] to the ZONE_DEVICE private
// PFN for the GPU chunk memory.
@@ -343,14 +343,6 @@ typedef struct
const uvm_page_mask_t *pages_to_evict,
uvm_va_block_region_t region);
// Migrate a GPU device-private page to system memory. This is
// called to remove CPU page table references to device private
// struct pages for the given GPU after all other references in
// va_blocks have been released and the GPU is in the process of
// being removed/torn down. Note that there is no mm, VMA,
// va_block or any user channel activity on this GPU.
NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn);
// This returns what would be the intersection of va_block start/end and
// VMA start/end-1 for the given 'lookup_address' if
// uvm_hmm_va_block_find_create() was called.
@@ -592,8 +584,10 @@ typedef struct
return NV_ERR_INVALID_ADDRESS;
}
static void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space)
static NV_STATUS uvm_hmm_remote_cpu_fault(struct vm_fault *vmf)
{
UVM_ASSERT(0);
return NV_ERR_INVALID_ADDRESS;
}
static NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block,
@@ -622,11 +616,6 @@ typedef struct
return NV_OK;
}
static NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
{
return NV_OK;
}
static NV_STATUS uvm_hmm_va_block_range_bounds(uvm_va_space_t *va_space,
struct mm_struct *mm,
NvU64 lookup_address,

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2020-2022 NVIDIA Corporation
Copyright (c) 2020-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -59,12 +59,12 @@ void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
// Physical CE writes to vidmem are non-coherent with respect to the CPU on
// GH180.
parent_gpu->ce_phys_vidmem_write_supported = !uvm_gpu_is_coherent(parent_gpu);
parent_gpu->ce_phys_vidmem_write_supported = !uvm_parent_gpu_is_coherent(parent_gpu);
// TODO: Bug 4174553: [HGX-SkinnyJoe][GH180] channel errors discussion/debug
// portion for the uvm tests became nonresponsive after
// some time and then failed even after reboot
parent_gpu->peer_copy_mode = uvm_gpu_is_coherent(parent_gpu) ?
parent_gpu->peer_copy_mode = uvm_parent_gpu_is_coherent(parent_gpu) ?
UVM_GPU_PEER_COPY_MODE_VIRTUAL : g_uvm_global.peer_copy_mode;
// All GR context buffers may be mapped to 57b wide VAs. All "compute" units

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2020-2023 NVIDIA Corporation
Copyright (c) 2020-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -368,10 +368,7 @@ static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_hopper(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
static void make_pde_hopper(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
{
NvU32 entry_count = entries_per_index_hopper(depth);
NvU64 *entry_bits = (NvU64 *)entry;

View File

@@ -128,8 +128,9 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
// present if we see the callback.
//
// The callback was added in commit 0f0a327fa12cd55de5e7f8c05a70ac3d047f405e,
// v3.19 (2014-11-13).
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE)
// v3.19 (2014-11-13) and renamed in commit 1af5a8109904.
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE) || \
defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
#define UVM_CAN_USE_MMU_NOTIFIERS() 1
#else
#define UVM_CAN_USE_MMU_NOTIFIERS() 0
@@ -348,6 +349,47 @@ static inline NvU64 NV_GETTIME(void)
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))
#endif
#if !defined(NV_FIND_NEXT_BIT_WRAP_PRESENT)
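// Fallback for kernels that do not provide find_next_bit_wrap(): returns the
// first set bit at or after 'offset', wrapping around to the start of the
// bitmap, or 'size' if no bit is set.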
static inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset)
{
unsigned long bit = find_next_bit(addr, size, offset);
if (bit < size)
return bit;
bit = find_first_bit(addr, offset);
return bit < offset ? bit : size;
}
#endif
// for_each_set_bit_wrap and __for_each_wrap were introduced in v6.1-rc1
// by commit 4fe49b3b97c2640147c46519c2a6fdb06df34f5f
#if !defined(for_each_set_bit_wrap)
static inline unsigned long __for_each_wrap(const unsigned long *bitmap,
unsigned long size,
unsigned long start,
unsigned long n)
{
unsigned long bit;
if (n > start) {
bit = find_next_bit(bitmap, size, n);
if (bit < size)
return bit;
n = 0;
}
bit = find_next_bit(bitmap, start, n);
return bit < start ? bit : size;
}
#define for_each_set_bit_wrap(bit, addr, size, start) \
for ((bit) = find_next_bit_wrap((addr), (size), (start)); \
(bit) < (size); \
(bit) = __for_each_wrap((addr), (size), (start), (bit) + 1))
#endif
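// A minimal usage sketch of the two wrap-around helpers above. The function
// below is illustrative only and is not part of the driver; it simply counts
// the set bits visited by for_each_set_bit_wrap() starting at 'start'.
static inline unsigned int uvm_example_count_set_bits_wrap(const unsigned long *bitmap,
                                                           unsigned long size,
                                                           unsigned long start)
{
    unsigned long bit;
    unsigned int count = 0;

    // Each set bit is visited exactly once, beginning with the first set bit
    // at or after 'start' and wrapping around to the start of the bitmap.
    for_each_set_bit_wrap(bit, bitmap, size, start)
        count++;

    return count;
}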
// Added in 2.6.24
#ifndef ACCESS_ONCE
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
@@ -579,4 +621,5 @@ static inline pgprot_t uvm_pgprot_decrypted(pgprot_t prot)
#include <asm/page.h>
#define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x)))
#endif
#endif // _UVM_LINUX_H

View File

@@ -355,6 +355,7 @@ static uvm_membar_t va_range_downgrade_membar(uvm_va_range_t *va_range, uvm_ext_
if (!ext_gpu_map->mem_handle)
return UVM_MEMBAR_GPU;
// EGM uses the same barriers as sysmem.
return uvm_hal_downgrade_membar_type(ext_gpu_map->gpu,
!ext_gpu_map->is_sysmem && ext_gpu_map->gpu == ext_gpu_map->owning_gpu);
}
@@ -633,6 +634,8 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
const UvmGpuMemoryInfo *mem_info)
{
uvm_gpu_t *owning_gpu;
if (mem_info->egm)
UVM_ASSERT(mem_info->sysmem);
if (!mem_info->deviceDescendant && !mem_info->sysmem) {
ext_gpu_map->owning_gpu = NULL;
@@ -641,6 +644,7 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
}
// This is a local or peer allocation, so the owning GPU must have been
// registered.
// This also checks whether the EGM owning GPU is registered.
owning_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &mem_info->uuid);
if (!owning_gpu)
return NV_ERR_INVALID_DEVICE;
@@ -651,13 +655,10 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
// crashes when it's eventually freed.
// TODO: Bug 1811006: Bug tracking the RM issue, its fix might change the
// semantics of sysmem allocations.
if (mem_info->sysmem) {
ext_gpu_map->owning_gpu = owning_gpu;
ext_gpu_map->is_sysmem = true;
return NV_OK;
}
if (owning_gpu != mapping_gpu) {
// Check if peer access for peer memory is enabled.
// This path also handles EGM allocations.
if (owning_gpu != mapping_gpu && (!mem_info->sysmem || mem_info->egm)) {
// TODO: Bug 1757136: In SLI, the returned UUID may be different but a
// local mapping must be used. We need to query SLI groups to know
// that.
@@ -666,7 +667,9 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
}
ext_gpu_map->owning_gpu = owning_gpu;
ext_gpu_map->is_sysmem = false;
ext_gpu_map->is_sysmem = mem_info->sysmem;
ext_gpu_map->is_egm = mem_info->egm;
return NV_OK;
}
@@ -719,6 +722,7 @@ static NV_STATUS uvm_ext_gpu_map_split(uvm_range_tree_t *tree,
new->gpu = existing_map->gpu;
new->owning_gpu = existing_map->owning_gpu;
new->is_sysmem = existing_map->is_sysmem;
new->is_egm = existing_map->is_egm;
// Initialize the new ext_gpu_map tracker as a copy of the existing_map tracker.
// This way, any operations on any of the two ext_gpu_maps will be able to

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -106,10 +106,7 @@ static NvU64 small_half_pde_maxwell(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_maxwell(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
static void make_pde_maxwell(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
{
NvU64 pde_bits = 0;
UVM_ASSERT(depth == 0);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -93,9 +93,8 @@ static bool sysmem_can_be_mapped_on_gpu(uvm_mem_t *sysmem)
{
UVM_ASSERT(uvm_mem_is_sysmem(sysmem));
// In Confidential Computing, only unprotected memory can be mapped on the
// GPU
if (g_uvm_global.conf_computing_enabled)
// If SEV is enabled, only unprotected memory can be mapped
if (g_uvm_global.sev_enabled)
return uvm_mem_is_sysmem_dma(sysmem);
return true;
@@ -738,7 +737,7 @@ static NV_STATUS mem_map_cpu_to_sysmem_kernel(uvm_mem_t *mem)
pages[page_index] = mem_cpu_page(mem, page_index * PAGE_SIZE);
}
if (g_uvm_global.conf_computing_enabled && uvm_mem_is_sysmem_dma(mem))
if (g_uvm_global.sev_enabled && uvm_mem_is_sysmem_dma(mem))
prot = uvm_pgprot_decrypted(PAGE_KERNEL_NOENC);
mem->kernel.cpu_addr = vmap(pages, num_pages, VM_MAP, prot);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -44,10 +44,10 @@ static NvU32 first_page_size(NvU32 page_sizes)
static inline NV_STATUS __alloc_map_sysmem(NvU64 size, uvm_gpu_t *gpu, uvm_mem_t **sys_mem)
{
if (g_uvm_global.conf_computing_enabled)
if (g_uvm_global.sev_enabled)
return uvm_mem_alloc_sysmem_dma_and_map_cpu_kernel(size, gpu, current->mm, sys_mem);
return uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, sys_mem);
else
return uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, sys_mem);
}
static NV_STATUS check_accessible_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem)
@@ -335,6 +335,9 @@ error:
static bool should_test_page_size(size_t alloc_size, NvU32 page_size)
{
if (g_uvm_global.sev_enabled)
return false;
if (g_uvm_global.num_simulated_devices == 0)
return true;

View File

@@ -130,9 +130,9 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
NV_STATUS status = NV_OK;
NV_STATUS tracker_status;
// Save the mask of unmapped pages because it will change after the
// Get the mask of unmapped pages because it will change after the
// first map operation
uvm_page_mask_complement(&va_block_context->caller_page_mask, &va_block->maybe_mapped_pages);
uvm_va_block_unmapped_pages_get(va_block, region, &va_block_context->caller_page_mask);
if (uvm_va_block_is_hmm(va_block) && !UVM_ID_IS_CPU(dest_id)) {
// Do not map pages that are already resident on the CPU. This is in
@@ -147,7 +147,7 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
// such pages at all, when migrating.
uvm_page_mask_andnot(&va_block_context->caller_page_mask,
&va_block_context->caller_page_mask,
uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU));
uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE));
}
// Only map those pages that are not mapped anywhere else (likely due
@@ -377,7 +377,7 @@ static bool va_block_should_do_cpu_preunmap(uvm_va_block_t *va_block,
mapped_pages_cpu = uvm_va_block_map_mask_get(va_block, UVM_ID_CPU);
if (uvm_processor_mask_test(&va_block->resident, dest_id)) {
const uvm_page_mask_t *resident_pages_dest = uvm_va_block_resident_mask_get(va_block, dest_id);
const uvm_page_mask_t *resident_pages_dest = uvm_va_block_resident_mask_get(va_block, dest_id, NUMA_NO_NODE);
uvm_page_mask_t *do_not_unmap_pages = &va_block_context->scratch_page_mask;
// TODO: Bug 1877578

View File

@@ -672,14 +672,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
.finalize_and_map = uvm_migrate_vma_finalize_and_map_helper,
};
// WAR for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
//
// This code path isn't used on GH180 but we need to maintain consistent
// behaviour on systems that do use it.
if (!vma_is_anonymous(args->vma))
return NV_WARN_NOTHING_TO_DO;
ret = migrate_vma(&uvm_migrate_vma_ops, args->vma, args->start, args->end, args->src, args->dst, state);
if (ret < 0)
return errno_to_nv_status(ret);
@@ -693,24 +685,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
if (ret < 0)
return errno_to_nv_status(ret);
// TODO: Bug 2419180: support file-backed pages in migrate_vma, when
// support for it is added to the Linux kernel
//
// A side-effect of migrate_vma_setup() is that it calls MMU notifiers even if
// a page can't be migrated (e.g. because it's a non-anonymous mapping). We
// need this side-effect for SMMU on GH180 to ensure any cached read-only
// entries are flushed from SMMU on permission upgrade.
//
// TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
//
// The above WAR doesn't work for HugeTLBfs mappings because
// migrate_vma_setup() will fail in that case.
if (!vma_is_anonymous(args->vma)) {
migrate_vma_finalize(args);
return NV_WARN_NOTHING_TO_DO;
}
uvm_migrate_vma_alloc_and_copy(args, state);
if (state->status == NV_OK) {
migrate_vma_pages(args);
@@ -884,13 +858,9 @@ static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
start = max(start, vma->vm_start);
outer = min(outer, vma->vm_end);
// migrate_vma only supports anonymous VMAs. We check for those after
// calling migrate_vma_setup() to work around Bug 4130089. We need to check
// for HugeTLB VMAs here because migrate_vma_setup() will return a fatal
// error for those.
// TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
if (is_vm_hugetlb_page(vma))
// TODO: Bug 2419180: support file-backed pages in migrate_vma, when
// support for it is added to the Linux kernel
if (!vma_is_anonymous(vma))
return NV_WARN_NOTHING_TO_DO;
if (uvm_processor_mask_empty(&va_space->registered_gpus))

View File

@@ -51,7 +51,7 @@ typedef struct
#if defined(CONFIG_MIGRATE_VMA_HELPER)
#define UVM_MIGRATE_VMA_SUPPORTED 1
#else
#if NV_IS_EXPORT_SYMBOL_PRESENT_migrate_vma_setup
#if defined(CONFIG_DEVICE_PRIVATE) && defined(NV_MIGRATE_VMA_SETUP_PRESENT)
#define UVM_MIGRATE_VMA_SUPPORTED 1
#endif
#endif

View File

@@ -323,153 +323,37 @@ static void uvm_mmu_page_table_cpu_memset_16(uvm_gpu_t *gpu,
uvm_mmu_page_table_cpu_unmap(gpu, phys_alloc);
}
static void pde_fill_cpu(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr)
{
NvU64 pde_data[2], entry_size;
NvU32 i;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(directory->depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
for (i = 0; i < pde_count; i++) {
tree->hal->make_pde(pde_data, phys_addr, directory->depth, directory->entries[start_index + i]);
if (entry_size == sizeof(pde_data[0]))
uvm_mmu_page_table_cpu_memset_8(tree->gpu, &directory->phys_alloc, start_index + i, pde_data[0], 1);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, &directory->phys_alloc, start_index + i, pde_data, 1);
}
}
static void pde_fill_gpu(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->phys_alloc.addr);
NvU32 max_inline_entries;
uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
uvm_gpu_address_t inline_data_addr;
uvm_push_inline_data_t inline_data;
NvU32 entry_count, i, j;
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(directory->depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / entry_size;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
pde_entry_addr.address += start_index * entry_size;
for (i = 0; i < pde_count;) {
// All but the first memory operation can be pipelined. We respect the
// caller's pipelining settings for the first push.
if (i != 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
entry_count = min(pde_count - i, max_inline_entries);
// No membar is needed until the last memory operation. Otherwise,
// use caller's membar flag.
if ((i + entry_count) < pde_count)
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
uvm_push_set_flag(push, push_membar_flag);
uvm_push_inline_data_begin(push, &inline_data);
for (j = 0; j < entry_count; j++) {
tree->hal->make_pde(pde_data, phys_addr, directory->depth, directory->entries[start_index + i + j]);
uvm_push_inline_data_add(&inline_data, pde_data, entry_size);
}
inline_data_addr = uvm_push_inline_data_end(&inline_data);
tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * entry_size);
i += entry_count;
pde_entry_addr.address += entry_size * entry_count;
}
}
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
// pde_fill() is optimized for pde_count == 1, which is the common case. The
// map_remap() function is the only caller with pde_count > 1, and it is used
// only on GA100 GPUs for 512MB page size mappings.
static void pde_fill(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, directory->depth, UVM_PAGE_SIZE_AGNOSTIC));
if (push)
pde_fill_gpu(tree, directory, start_index, pde_count, phys_addr, push);
else
pde_fill_cpu(tree, directory, start_index, pde_count, phys_addr);
}
static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
{
NvU32 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);
NvU64 clear_bits[2];
uvm_mmu_mode_hal_t *hal = tree->hal;
// Passing in NULL for the phys_allocs will mark the child entries as
// invalid.
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
// Init with an invalid PTE or clean PDE. Only Maxwell PDEs can have more
// than 512 entries. We initialize them all with the same clean PDE.
// Additionally, only ATS systems may require clean PDE bit settings based
// on the mapping VA.
if (dir->depth == tree->hal->page_table_depth(page_size) || (entries_count > 512 && !g_uvm_global.ats.enabled)) {
NvU64 clear_bits[2];
// If it is not a PTE, make a clean PDE.
if (dir->depth != tree->hal->page_table_depth(page_size)) {
tree->hal->make_pde(clear_bits, phys_allocs, dir->depth, dir->entries[0]);
// Make sure that using only clear_bits[0] will work.
UVM_ASSERT(tree->hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
}
else {
*clear_bits = 0;
}
// Initialize the memory to a reasonable value.
if (push) {
tree->gpu->parent->ce_hal->memset_8(push,
uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
*clear_bits,
dir->phys_alloc.size);
}
else {
uvm_mmu_page_table_cpu_memset_8(tree->gpu,
&dir->phys_alloc,
0,
*clear_bits,
dir->phys_alloc.size / sizeof(*clear_bits));
}
if (dir->depth == tree->hal->page_table_depth(page_size)) {
*clear_bits = 0; // Invalid PTE
}
else {
pde_fill(tree, dir, 0, entries_count, phys_allocs, push);
// passing in NULL for the phys_allocs will mark the child entries as invalid
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
hal->make_pde(clear_bits, phys_allocs, dir->depth);
// Make sure that using only clear_bits[0] will work
UVM_ASSERT(hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
}
// initialize the memory to a reasonable value
if (push) {
tree->gpu->parent->ce_hal->memset_8(push,
uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
*clear_bits,
dir->phys_alloc.size);
}
else {
uvm_mmu_page_table_cpu_memset_8(tree->gpu,
&dir->phys_alloc,
0,
*clear_bits,
dir->phys_alloc.size / sizeof(*clear_bits));
}
}
static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
@@ -483,10 +367,8 @@ static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
NvLength phys_alloc_size = hal->allocation_size(depth, page_size);
uvm_page_directory_t *dir;
// The page tree doesn't cache PTEs so space is not allocated for entries
// that are always PTEs.
// 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not
// page_size.
// The page tree doesn't cache PTEs so space is not allocated for entries that are always PTEs.
// 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not page_size.
if (depth == hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC))
entry_count = 0;
else
@@ -527,6 +409,108 @@ static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, N
return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
}
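// Write pde_count identical PDE entries, starting at start_index, through the
// CPU mapping of the directory allocation.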
static void pde_fill_cpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr)
{
NvU64 pde_data[2], entry_size;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
if (entry_size == sizeof(pde_data[0]))
uvm_mmu_page_table_cpu_memset_8(tree->gpu, directory, start_index, pde_data[0], pde_count);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, directory, start_index, pde_data, pde_count);
}
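// Same as pde_fill_cpu(), but the entries are written with the GPU's Copy
// Engine: 8-byte entries use a memset, while 16-byte entries are staged as
// inline push data and copied into the directory.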
static void pde_fill_gpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->addr);
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
pde_entry_addr.address += start_index * entry_size;
if (entry_size == sizeof(pde_data[0])) {
tree->gpu->parent->ce_hal->memset_8(push, pde_entry_addr, pde_data[0], sizeof(pde_data[0]) * pde_count);
}
else {
NvU32 max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / sizeof(pde_data);
uvm_gpu_address_t inline_data_addr;
uvm_push_inline_data_t inline_data;
uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
NvU32 i;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
for (i = 0; i < pde_count;) {
NvU32 j;
NvU32 entry_count = min(pde_count - i, max_inline_entries);
uvm_push_inline_data_begin(push, &inline_data);
for (j = 0; j < entry_count; j++)
uvm_push_inline_data_add(&inline_data, pde_data, sizeof(pde_data));
inline_data_addr = uvm_push_inline_data_end(&inline_data);
// All but the first memcopy can be pipelined. We respect the
// caller's pipelining settings for the first push.
if (i != 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
// No membar is needed until the last copy. Otherwise, use
// caller's membar flag.
if (i + entry_count < pde_count)
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
uvm_push_set_flag(push, push_membar_flag);
tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * sizeof(pde_data));
i += entry_count;
pde_entry_addr.address += sizeof(pde_data) * entry_count;
}
}
}
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
static void pde_fill(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
if (push)
pde_fill_gpu(tree, depth, directory, start_index, pde_count, phys_addr, push);
else
pde_fill_cpu(tree, depth, directory, start_index, pde_count, phys_addr);
}
static uvm_page_directory_t *host_pde_write(uvm_page_directory_t *dir,
uvm_page_directory_t *parent,
NvU32 index_in_parent)
@@ -556,7 +540,7 @@ static void pde_write(uvm_page_tree_t *tree,
phys_allocs[i] = &entry->phys_alloc;
}
pde_fill(tree, dir, entry_index, 1, phys_allocs, push);
pde_fill(tree, dir->depth, &dir->phys_alloc, entry_index, 1, phys_allocs, push);
}
static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size)
@@ -829,11 +813,8 @@ static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm
static void map_remap_deinit(uvm_page_tree_t *tree)
{
if (tree->map_remap.pde0) {
phys_mem_deallocate(tree, &tree->map_remap.pde0->phys_alloc);
uvm_kvfree(tree->map_remap.pde0);
tree->map_remap.pde0 = NULL;
}
if (tree->map_remap.pde0.size)
phys_mem_deallocate(tree, &tree->map_remap.pde0);
if (tree->map_remap.ptes_invalid_4k.size)
phys_mem_deallocate(tree, &tree->map_remap.ptes_invalid_4k);
@@ -858,16 +839,10 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
// PDE1-depth(512M) PTE. We first map it to the pde0 directory, then we
// return the PTE to get_ptes()'s caller.
if (tree->hal->page_sizes() & UVM_PAGE_SIZE_512M) {
tree->map_remap.pde0 = allocate_directory(tree,
UVM_PAGE_SIZE_2M,
tree->hal->page_table_depth(UVM_PAGE_SIZE_2M),
UVM_PMM_ALLOC_FLAGS_EVICT);
if (tree->map_remap.pde0 == NULL) {
status = NV_ERR_NO_MEMORY;
status = allocate_page_table(tree, UVM_PAGE_SIZE_2M, &tree->map_remap.pde0);
if (status != NV_OK)
goto error;
}
}
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "map remap init");
if (status != NV_OK)
goto error;
@@ -889,23 +864,22 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
NvU32 depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_4K) - 1;
size_t index_4k = tree->hal->entry_offset(depth, UVM_PAGE_SIZE_4K);
NvU32 pde0_entries = tree->map_remap.pde0->phys_alloc.size / tree->hal->entry_size(tree->map_remap.pde0->depth);
// pde0 depth equals UVM_PAGE_SIZE_2M.
NvU32 pde0_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_2M);
NvU32 pde0_entries = tree->map_remap.pde0.size / tree->hal->entry_size(pde0_depth);
// The big-page entry is NULL which makes it an invalid entry.
phys_allocs[index_4k] = &tree->map_remap.ptes_invalid_4k;
// By default CE operations include a MEMBAR_SYS. MEMBAR_GPU is
// sufficient when pde0 is allocated in VIDMEM.
if (tree->map_remap.pde0->phys_alloc.addr.aperture == UVM_APERTURE_VID)
if (tree->map_remap.pde0.addr.aperture == UVM_APERTURE_VID)
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
// This is an orphan directory; make_pde() requires a directory to
// compute the VA. The UVM depth map_remap() operates at is not in the
// range make_pde() must operate on. We only need to supply the fields
// used by make_pde() so it does not access invalid memory addresses.
pde_fill(tree,
tree->map_remap.pde0,
pde0_depth,
&tree->map_remap.pde0,
0,
pde0_entries,
(uvm_mmu_page_table_alloc_t **)&phys_allocs,
@@ -932,10 +906,11 @@ error:
// --------------|-------------------------||----------------|----------------
// vidmem | - || vidmem | false
// sysmem | - || sysmem | false
// default | <not set> || vidmem | true
// default | <not set> || vidmem | true (1)
// default | vidmem || vidmem | false
// default | sysmem || sysmem | false
//
// (1) When SEV mode is enabled, the fallback path is disabled.
//
// In SR-IOV heavy the page tree must be in vidmem, to prevent guest drivers
// from updating GPU page tables without hypervisor knowledge.
@@ -951,27 +926,28 @@ error:
//
static void page_tree_set_location(uvm_page_tree_t *tree, uvm_aperture_t location)
{
bool should_location_be_vidmem;
UVM_ASSERT(tree->gpu != NULL);
UVM_ASSERT_MSG((location == UVM_APERTURE_VID) ||
(location == UVM_APERTURE_SYS) ||
(location == UVM_APERTURE_DEFAULT),
"Invalid location %s (%d)\n", uvm_aperture_string(location), (int)location);
// The page tree of a "fake" GPU used during page tree testing can be in
// sysmem in scenarios where a "real" GPU must be in vidmem. Fake GPUs can
// be identified by having no channel manager.
if (tree->gpu->channel_manager != NULL) {
should_location_be_vidmem = uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu)
|| uvm_conf_computing_mode_enabled(tree->gpu);
if (uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu))
UVM_ASSERT(location == UVM_APERTURE_VID);
else if (uvm_conf_computing_mode_enabled(tree->gpu))
UVM_ASSERT(location == UVM_APERTURE_VID);
}
// The page tree of a "fake" GPU used during page tree testing can be in
// sysmem even if should_location_be_vidmem is true. A fake GPU can be
// identified by having no channel manager.
if ((tree->gpu->channel_manager != NULL) && should_location_be_vidmem)
UVM_ASSERT(location == UVM_APERTURE_VID);
if (location == UVM_APERTURE_DEFAULT) {
if (page_table_aperture == UVM_APERTURE_DEFAULT) {
tree->location = UVM_APERTURE_VID;
tree->location_sys_fallback = true;
// See the comment (1) above.
tree->location_sys_fallback = !g_uvm_global.sev_enabled;
}
else {
tree->location = page_table_aperture;
@@ -1358,9 +1334,10 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
if (uvm_page_table_range_aperture(range) == UVM_APERTURE_VID)
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
phys_alloc[0] = &tree->map_remap.pde0->phys_alloc;
phys_alloc[0] = &tree->map_remap.pde0;
pde_fill(tree,
range->table,
range->table->depth,
&range->table->phys_alloc,
range->start_index,
range->entry_count,
(uvm_mmu_page_table_alloc_t **)&phys_alloc,

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -219,7 +219,7 @@ struct uvm_mmu_mode_hal_struct
// point to two items for dual PDEs).
// Any of the allocs may be NULL, in which case they are treated as empty.
void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, NvU32 depth, uvm_page_directory_t *child_dir);
void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, NvU32 depth);
// size of an entry in a directory/table. Generally either 8 or 16 bytes.
// (in the case of Pascal dual PDEs)
@@ -229,7 +229,7 @@ struct uvm_mmu_mode_hal_struct
NvU32 (*entries_per_index)(NvU32 depth);
// For dual PDEs, this is either 1 or 0, depending on the page size.
// This is used to index the host copy only. GPU PDEs are always entirely
// This is used to index the host copy only. GPU PDEs are always entirely
// re-written using make_pde.
NvLength (*entry_offset)(NvU32 depth, NvU32 page_size);
@@ -295,8 +295,9 @@ struct uvm_page_tree_struct
// PDE0 where all big-page entries are invalid, and small-page entries
// point to ptes_invalid_4k.
// pde0 is used on Pascal+ GPUs, i.e., they have the same PDE format.
uvm_page_directory_t *pde0;
// pde0 is only used on Pascal-Ampere, i.e., they have the same PDE
// format.
uvm_mmu_page_table_alloc_t pde0;
} map_remap;
// Tracker for all GPU operations on the tree
@@ -364,32 +365,21 @@ void uvm_page_tree_deinit(uvm_page_tree_t *tree);
// the same page size without an intervening put_ptes. To duplicate a subset of
// an existing range or change the size of an existing range, use
// uvm_page_table_range_get_upper() and/or uvm_page_table_range_shrink().
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range);
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
// Same as uvm_page_tree_get_ptes(), but doesn't synchronize the GPU work.
//
// All pending operations can be waited on with uvm_page_tree_wait().
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range);
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
// Returns a single-entry page table range for the addresses passed.
// The size parameter must be a page size supported by this tree.
// This is equivalent to calling uvm_page_tree_get_ptes() with size equal to
// page_size.
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single);
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *single);
// For a single-entry page table range, write the PDE (which could be a dual
// PDE) to the GPU.
@@ -488,8 +478,8 @@ NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
// new_range_vec will contain the upper portion of range_vec, starting at
// new_end + 1.
//
// new_end + 1 is required to be within the address range of range_vec and be
// aligned to range_vec's page_size.
// new_end + 1 is required to be within the address range of range_vec and be aligned to
// range_vec's page_size.
//
// On failure, the original range vector is left unmodified.
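// For example (illustrative numbers, not taken from a caller): splitting a
// range_vec covering [0x0, 0x200000) with a 64K page size at new_end =
// 0xFFFFF leaves [0x0, 0x100000) in range_vec and moves [0x100000, 0x200000)
// into new_range_vec, since new_end + 1 = 0x100000 is 64K-aligned and lies
// inside the original range.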
NV_STATUS uvm_page_table_range_vec_split_upper(uvm_page_table_range_vec_t *range_vec,
@@ -511,22 +501,18 @@ void uvm_page_table_range_vec_destroy(uvm_page_table_range_vec_t *range_vec);
// for each offset.
// The caller_data pointer is what the caller passed in as caller_data to
// uvm_page_table_range_vec_write_ptes().
typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec,
NvU64 offset,
void *caller_data);
typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec, NvU64 offset,
void *caller_data);
// Write all PTEs covered by the range vector using the given PTE making
// function.
// Write all PTEs covered by the range vector using the given PTE making function.
//
// After writing all the PTEs a TLB invalidate operation is performed including
// the passed in tlb_membar.
//
// See comments about uvm_page_table_range_pte_maker_t for details about the
// PTE making callback.
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker,
void *caller_data);
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker, void *caller_data);
// Set all PTEs covered by the range vector to an empty PTE
//
@@ -650,9 +636,8 @@ static NvU64 uvm_page_table_range_size(uvm_page_table_range_t *range)
// Get the physical address of the entry at entry_index within the range
// (counted from range->start_index).
static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree,
uvm_page_table_range_t *range,
size_t entry_index)
static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree, uvm_page_table_range_t *range,
size_t entry_index)
{
NvU32 entry_size = uvm_mmu_pte_size(tree, range->page_size);
uvm_gpu_phys_address_t entry = range->table->phys_alloc.addr;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -146,15 +146,9 @@ static void fake_tlb_invals_disable(void)
g_fake_tlb_invals_tracking_enabled = false;
}
// Fake TLB invalidate VA that just saves off the parameters so that they can be
// verified later.
static void fake_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
uvm_membar_t membar)
// Fake TLB invalidate VA that just saves off the parameters so that they can be verified later
static void fake_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_address_t pdb,
NvU32 depth, NvU64 base, NvU64 size, NvU32 page_size, uvm_membar_t membar)
{
if (!g_fake_tlb_invals_tracking_enabled)
return;
@@ -216,8 +210,8 @@ static bool assert_and_reset_last_invalidate(NvU32 expected_depth, bool expected
}
if ((g_last_fake_inval->membar == UVM_MEMBAR_NONE) == expected_membar) {
UVM_TEST_PRINT("Expected %s membar, got %s instead\n",
expected_membar ? "a" : "no",
uvm_membar_string(g_last_fake_inval->membar));
expected_membar ? "a" : "no",
uvm_membar_string(g_last_fake_inval->membar));
result = false;
}
@@ -236,8 +230,7 @@ static bool assert_last_invalidate_all(NvU32 expected_depth, bool expected_memba
}
if (g_last_fake_inval->base != 0 || g_last_fake_inval->size != -1) {
UVM_TEST_PRINT("Expected invalidate all but got range [0x%llx, 0x%llx) instead\n",
g_last_fake_inval->base,
g_last_fake_inval->base + g_last_fake_inval->size);
g_last_fake_inval->base, g_last_fake_inval->base + g_last_fake_inval->size);
return false;
}
if (g_last_fake_inval->depth != expected_depth) {
@@ -254,16 +247,15 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);
if (g_fake_invals_count == 0) {
UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n", base, base + size);
UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n",
base, base + size);
return false;
}
if ((inval->base != base || inval->size != size) && inval->base != 0 && inval->size != -1) {
UVM_TEST_PRINT("Expected invalidate range [0x%llx, 0x%llx), but got range [0x%llx, 0x%llx) instead\n",
base,
base + size,
inval->base,
inval->base + inval->size);
base, base + size,
inval->base, inval->base + inval->size);
return false;
}
if (inval->depth != expected_depth) {
@@ -278,13 +270,7 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
return true;
}
static bool assert_invalidate_range(NvU64 base,
NvU64 size,
NvU32 page_size,
bool allow_inval_all,
NvU32 range_depth,
NvU32 all_depth,
bool expected_membar)
static bool assert_invalidate_range(NvU64 base, NvU64 size, NvU32 page_size, bool allow_inval_all, NvU32 range_depth, NvU32 all_depth, bool expected_membar)
{
NvU32 i;
@@ -502,6 +488,7 @@ static NV_STATUS alloc_adjacent_pde_64k_memory(uvm_gpu_t *gpu)
return NV_OK;
}
static NV_STATUS alloc_nearby_pde_64k_memory(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
@@ -855,7 +842,6 @@ static NV_STATUS get_two_free_apart(uvm_gpu_t *gpu)
TEST_CHECK_RET(range2.entry_count == 256);
TEST_CHECK_RET(range2.table->ref_count == 512);
TEST_CHECK_RET(range1.table == range2.table);
// 4k page is second entry in a dual PDE
TEST_CHECK_RET(range1.table == tree.root->entries[0]->entries[0]->entries[0]->entries[1]);
TEST_CHECK_RET(range1.start_index == 256);
@@ -885,7 +871,6 @@ static NV_STATUS get_overlapping_dual_pdes(uvm_gpu_t *gpu)
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, size, size, &range64k), NV_OK);
TEST_CHECK_RET(range64k.entry_count == 16);
TEST_CHECK_RET(range64k.table->ref_count == 16);
// 4k page is second entry in a dual PDE
TEST_CHECK_RET(range64k.table == tree.root->entries[0]->entries[0]->entries[0]->entries[0]);
TEST_CHECK_RET(range64k.start_index == 16);
@@ -1045,13 +1030,10 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
// Depth 4
NvU64 extent_pte = UVM_PAGE_SIZE_2M;
// Depth 3
NvU64 extent_pde0 = extent_pte * (1ull << 8);
// Depth 2
NvU64 extent_pde1 = extent_pde0 * (1ull << 9);
// Depth 1
NvU64 extent_pde2 = extent_pde1 * (1ull << 9);
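// Worked out, assuming UVM_PAGE_SIZE_2M is 2MB as the name suggests:
// extent_pde0 = 2MB * 256 = 512MB, extent_pde1 = 512MB * 512 = 256GB, and
// extent_pde2 = 256GB * 512 = 128TB.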
@@ -1099,11 +1081,7 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
return status;
}
static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
NvU64 base,
NvU64 size,
NvU32 min_page_size,
NvU32 max_page_size)
static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree, NvU64 base, NvU64 size, NvU32 min_page_size, NvU32 max_page_size)
{
NV_STATUS status = NV_OK;
uvm_push_t push;
@@ -1227,11 +1205,7 @@ static bool assert_range_vec_ptes(uvm_page_table_range_vec_t *range_vec, bool ex
NvU64 expected_pte = expecting_cleared ? 0 : range_vec->size + offset;
if (*pte != expected_pte) {
UVM_TEST_PRINT("PTE is 0x%llx instead of 0x%llx for offset 0x%llx within range [0x%llx, 0x%llx)\n",
*pte,
expected_pte,
offset,
range_vec->start,
range_vec->size);
*pte, expected_pte, offset, range_vec->start, range_vec->size);
return false;
}
offset += range_vec->page_size;
@@ -1252,11 +1226,7 @@ static NV_STATUS test_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec
TEST_CHECK_RET(data.status == NV_OK);
TEST_CHECK_RET(data.count == range_vec->size / range_vec->page_size);
TEST_CHECK_RET(assert_invalidate_range_specific(g_last_fake_inval,
range_vec->start,
range_vec->size,
range_vec->page_size,
page_table_depth,
membar != UVM_MEMBAR_NONE));
range_vec->start, range_vec->size, range_vec->page_size, page_table_depth, membar != UVM_MEMBAR_NONE));
TEST_CHECK_RET(assert_range_vec_ptes(range_vec, false));
fake_tlb_invals_disable();
@@ -1279,11 +1249,7 @@ static NV_STATUS test_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec
return NV_OK;
}
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
uvm_page_table_range_vec_t **range_vec_out)
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree, NvU64 start, NvU64 size, NvU32 page_size, uvm_page_table_range_vec_t **range_vec_out)
{
uvm_page_table_range_vec_t *range_vec;
uvm_pmm_alloc_flags_t pmm_flags = UVM_PMM_ALLOC_FLAGS_EVICT;
@@ -1586,17 +1552,17 @@ static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
memset(phys_allocs, 0, sizeof(phys_allocs));
hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
hal->make_pde(&pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits == 0x0L);
phys_allocs[0] = &alloc_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
hal->make_pde(&pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits == 0x1BBBBBBD99999992LL);
phys_allocs[0] = &alloc_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
hal->make_pde(&pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits == 0x9999999E1BBBBBB1LL);
for (j = 0; j <= 2; j++) {
@@ -1666,7 +1632,6 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
// big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBBB00LL);
@@ -1674,31 +1639,31 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
// uncached, i.e., the sysmem data is not cached in GPU's L2 cache. Clear
@@ -1762,36 +1727,36 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
// NO_ATS PDE1 (depth 2)
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 2, NULL);
hal->make_pde(pde_bits, phys_allocs, 2);
if (g_uvm_global.ats.enabled)
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB2A);
else
@@ -1840,32 +1805,32 @@ static NV_STATUS entry_test_hopper(uvm_gpu_t *gpu, entry_test_page_size_func ent
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
// Cleared PDEs work as expected for big and small PDEs.
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 4, NULL);
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs, uncached ATS allowed.
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x999999999900C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBB00A);
// Dual PDEs, uncached.
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 4, NULL);
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 4, NULL);
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C);
// uncached, i.e., the sysmem data is not cached in GPU's L2 cache, and
@@ -2338,8 +2303,7 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
gpu->parent = parent_gpu;
// At least test_tlb_invalidates() relies on global state
// (g_tlb_invalidate_*) so make sure only one test instance can run at a
// time.
// (g_tlb_invalidate_*) so make sure only one test instance can run at a time.
uvm_mutex_lock(&g_uvm_global.global_lock);
// Allocate the fake TLB tracking state. Notably tests still need to enable

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2020 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -140,10 +140,7 @@ static NvU64 small_half_pde_pascal(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_pascal(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
static void make_pde_pascal(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
{
NvU32 entry_count = entries_per_index_pascal(depth);
NvU64 *entry_bits = (NvU64 *)entry;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2019 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -22,10 +22,7 @@
*******************************************************************************/
#include "uvm_perf_events.h"
#include "uvm_va_block.h"
#include "uvm_va_range.h"
#include "uvm_va_space.h"
#include "uvm_kvmalloc.h"
#include "uvm_test.h"
// Global variable used to check that callbacks are correctly executed
@@ -46,10 +43,7 @@ static NV_STATUS test_events(uvm_va_space_t *va_space)
NV_STATUS status;
uvm_perf_event_data_t event_data;
uvm_va_block_t block;
test_data = 0;
memset(&event_data, 0, sizeof(event_data));
// Use CPU id to avoid triggering the GPU stats update code
@@ -58,6 +52,7 @@ static NV_STATUS test_events(uvm_va_space_t *va_space)
// Register a callback for page fault
status = uvm_perf_register_event_callback(&va_space->perf_events, UVM_PERF_EVENT_FAULT, callback_inc_1);
TEST_CHECK_GOTO(status == NV_OK, done);
// Register a callback for page fault
status = uvm_perf_register_event_callback(&va_space->perf_events, UVM_PERF_EVENT_FAULT, callback_inc_2);
TEST_CHECK_GOTO(status == NV_OK, done);
@@ -65,13 +60,14 @@ static NV_STATUS test_events(uvm_va_space_t *va_space)
// va_space read lock is required for page fault event notification
uvm_va_space_down_read(va_space);
// Notify (fake) page fault. The two registered callbacks for this event increment the value of test_value
event_data.fault.block = &block;
// Notify (fake) page fault. The two registered callbacks for this event
// increment the value of test_value
uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_FAULT, &event_data);
uvm_va_space_up_read(va_space);
// test_data was initialized to zero. It should have been incremented by 1 and 2, respectively in the callbacks
// test_data was initialized to zero. It should have been incremented by 1
// and 2, respectively in the callbacks
TEST_CHECK_GOTO(test_data == 3, done);
done:
@@ -96,4 +92,3 @@ NV_STATUS uvm_test_perf_events_sanity(UVM_TEST_PERF_EVENTS_SANITY_PARAMS *params
done:
return status;
}

View File

@@ -355,7 +355,7 @@ static NvU32 uvm_perf_prefetch_prenotify_fault_migrations(uvm_va_block_t *va_blo
uvm_page_mask_zero(prefetch_pages);
if (UVM_ID_IS_CPU(new_residency) || va_block->gpus[uvm_id_gpu_index(new_residency)] != NULL)
resident_mask = uvm_va_block_resident_mask_get(va_block, new_residency);
resident_mask = uvm_va_block_resident_mask_get(va_block, new_residency, NUMA_NO_NODE);
// If this is a first-touch fault and the destination processor is the
// preferred location, populate the whole max_prefetch_region.

View File

@@ -164,7 +164,7 @@ typedef struct
uvm_spinlock_t lock;
uvm_va_block_context_t va_block_context;
uvm_va_block_context_t *va_block_context;
// Flag used to avoid scheduling delayed unpinning operations after
// uvm_perf_thrashing_stop has been called.
@@ -601,6 +601,14 @@ static va_space_thrashing_info_t *va_space_thrashing_info_create(uvm_va_space_t
va_space_thrashing = uvm_kvmalloc_zero(sizeof(*va_space_thrashing));
if (va_space_thrashing) {
uvm_va_block_context_t *block_context = uvm_va_block_context_alloc(NULL);
if (!block_context) {
uvm_kvfree(va_space_thrashing);
return NULL;
}
va_space_thrashing->pinned_pages.va_block_context = block_context;
va_space_thrashing->va_space = va_space;
va_space_thrashing_info_init_params(va_space_thrashing);
@@ -621,6 +629,7 @@ static void va_space_thrashing_info_destroy(uvm_va_space_t *va_space)
if (va_space_thrashing) {
uvm_perf_module_type_unset_data(va_space->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
uvm_va_block_context_free(va_space_thrashing->pinned_pages.va_block_context);
uvm_kvfree(va_space_thrashing);
}
}
@@ -1104,7 +1113,7 @@ static NV_STATUS unmap_remote_pinned_pages(uvm_va_block_t *va_block,
!uvm_processor_mask_test(&policy->accessed_by, processor_id));
if (uvm_processor_mask_test(&va_block->resident, processor_id)) {
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id);
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id, NUMA_NO_NODE);
if (!uvm_page_mask_andnot(&va_block_context->caller_page_mask,
&block_thrashing->pinned_pages.mask,
@@ -1312,9 +1321,8 @@ void thrashing_event_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_
if (block_thrashing->last_time_stamp == 0 ||
uvm_id_equal(block_thrashing->last_processor, processor_id) ||
time_stamp - block_thrashing->last_time_stamp > va_space_thrashing->params.lapse_ns) {
time_stamp - block_thrashing->last_time_stamp > va_space_thrashing->params.lapse_ns)
goto done;
}
num_block_pages = uvm_va_block_size(va_block) / PAGE_SIZE;
@@ -1803,7 +1811,7 @@ static void thrashing_unpin_pages(struct work_struct *work)
struct delayed_work *dwork = to_delayed_work(work);
va_space_thrashing_info_t *va_space_thrashing = container_of(dwork, va_space_thrashing_info_t, pinned_pages.dwork);
uvm_va_space_t *va_space = va_space_thrashing->va_space;
uvm_va_block_context_t *va_block_context = &va_space_thrashing->pinned_pages.va_block_context;
uvm_va_block_context_t *va_block_context = va_space_thrashing->pinned_pages.va_block_context;
// Take the VA space lock so that VA blocks don't go away during this
// operation.
@@ -1937,7 +1945,6 @@ void uvm_perf_thrashing_unload(uvm_va_space_t *va_space)
// Make sure that there are not pending work items
if (va_space_thrashing) {
UVM_ASSERT(va_space_thrashing->pinned_pages.in_va_space_teardown);
UVM_ASSERT(list_empty(&va_space_thrashing->pinned_pages.list));
va_space_thrashing_info_destroy(va_space);

View File

@@ -3377,76 +3377,47 @@ uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page)
return gpu->id;
}
static void evict_orphan_pages(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
NvU32 i;
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
UVM_ASSERT(chunk->suballoc);
for (i = 0; i < num_subchunks(chunk); i++) {
uvm_gpu_chunk_t *subchunk = chunk->suballoc->subchunks[i];
uvm_spin_lock(&pmm->list_lock);
if (subchunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT) {
uvm_spin_unlock(&pmm->list_lock);
evict_orphan_pages(pmm, subchunk);
continue;
}
if (subchunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED && subchunk->is_referenced) {
unsigned long pfn = uvm_pmm_gpu_devmem_get_pfn(pmm, subchunk);
// TODO: Bug 3368756: add support for large GPU pages.
UVM_ASSERT(uvm_gpu_chunk_get_size(subchunk) == PAGE_SIZE);
uvm_spin_unlock(&pmm->list_lock);
// The above check for subchunk state is racy because the
// chunk may be freed after the lock is dropped. It is
// still safe to proceed in that case because the struct
// page reference will have dropped to zero and cannot
// have been re-allocated as this is only called during
// GPU teardown. Therefore migrate_device_range() will
// simply fail.
uvm_hmm_pmm_gpu_evict_pfn(pfn);
continue;
}
uvm_spin_unlock(&pmm->list_lock);
}
}
// Free any orphan pages.
// This should be called as part of removing a GPU: after all work is stopped
// and all va_blocks have been destroyed. There normally won't be any
// device private struct page references left but there can be cases after
// fork() where a child process still holds a reference. This function searches
// for pages that still have a reference and migrates the page to the GPU in
// order to release the reference in the CPU page table.
static void uvm_pmm_gpu_free_orphan_pages(uvm_pmm_gpu_t *pmm)
// Check there are no orphan pages. This should only be called as part of
// removing a GPU: after all work is stopped and all va_blocks have been
// destroyed. By now there should be no device-private page references left as
// there are no va_space's left on this GPU and orphan pages should be removed
// by va_space destruction or unregistration from the GPU.
static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
{
size_t i;
bool ret = true;
unsigned long pfn;
struct range range = pmm->devmem.pagemap.range;
if (!pmm->initialized)
return;
// This is only safe to call during GPU teardown where chunks
// cannot be re-allocated.
UVM_ASSERT(uvm_gpu_retained_count(uvm_pmm_to_gpu(pmm)) == 0);
if (!pmm->initialized || !uvm_hmm_is_enabled_system_wide())
return ret;
// Scan all the root chunks looking for subchunks which are still
// referenced. This is slow, but we only do this when unregistering a GPU
// and is not critical for performance.
// referenced.
for (i = 0; i < pmm->root_chunks.count; i++) {
uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
root_chunk_lock(pmm, root_chunk);
if (root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
evict_orphan_pages(pmm, &root_chunk->chunk);
ret = false;
root_chunk_unlock(pmm, root_chunk);
}
for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
struct page *page = pfn_to_page(pfn);
if (!is_device_private_page(page)) {
ret = false;
break;
}
if (page_count(page)) {
ret = false;
break;
}
}
return ret;
}
static void devmem_page_free(struct page *page)
@@ -3479,7 +3450,7 @@ static vm_fault_t devmem_fault(struct vm_fault *vmf)
{
uvm_va_space_t *va_space = vmf->page->zone_device_data;
if (!va_space || va_space->va_space_mm.mm != vmf->vma->vm_mm)
if (!va_space)
return VM_FAULT_SIGBUS;
return uvm_va_space_cpu_fault_hmm(va_space, vmf->vma, vmf);
@@ -3568,8 +3539,9 @@ static void devmem_deinit(uvm_pmm_gpu_t *pmm)
{
}
static void uvm_pmm_gpu_free_orphan_pages(uvm_pmm_gpu_t *pmm)
static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
{
return true;
}
#endif // UVM_IS_CONFIG_HMM()
@@ -3744,7 +3716,7 @@ void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm)
gpu = uvm_pmm_to_gpu(pmm);
uvm_pmm_gpu_free_orphan_pages(pmm);
UVM_ASSERT(uvm_pmm_gpu_check_orphan_pages(pmm));
nv_kthread_q_flush(&gpu->parent->lazy_free_q);
UVM_ASSERT(list_empty(&pmm->root_chunks.va_block_lazy_free));
release_free_root_chunks(pmm);

View File

@@ -749,6 +749,7 @@ NV_STATUS uvm_cpu_chunk_map_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
}
static struct page *uvm_cpu_chunk_alloc_page(uvm_chunk_size_t alloc_size,
int nid,
uvm_cpu_chunk_alloc_flags_t alloc_flags)
{
gfp_t kernel_alloc_flags;
@@ -764,18 +765,27 @@ static struct page *uvm_cpu_chunk_alloc_page(uvm_chunk_size_t alloc_size,
kernel_alloc_flags |= GFP_HIGHUSER;
// For allocation sizes higher than PAGE_SIZE, use __GFP_NORETRY in
// order to avoid higher allocation latency from the kernel compacting
// memory to satisfy the request.
// For allocation sizes higher than PAGE_SIZE, use __GFP_NORETRY in order
// to avoid higher allocation latency from the kernel compacting memory to
// satisfy the request.
// Use __GFP_NOWARN to avoid printing allocation failure to the kernel log.
// High order allocation failures are handled gracefully by the caller.
if (alloc_size > PAGE_SIZE)
kernel_alloc_flags |= __GFP_COMP | __GFP_NORETRY;
kernel_alloc_flags |= __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN;
if (alloc_flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO)
kernel_alloc_flags |= __GFP_ZERO;
page = alloc_pages(kernel_alloc_flags, get_order(alloc_size));
if (page && (alloc_flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO))
SetPageDirty(page);
UVM_ASSERT(nid < num_online_nodes());
if (nid == NUMA_NO_NODE)
page = alloc_pages(kernel_alloc_flags, get_order(alloc_size));
else
page = alloc_pages_node(nid, kernel_alloc_flags, get_order(alloc_size));
if (page) {
if (alloc_flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO)
SetPageDirty(page);
}
return page;
}
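// Worked example of the flag composition above (illustrative; assumes 4KB
// base pages): a 2MB chunk requested with UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO is
// allocated with order get_order(2MB) = 9 and picks up __GFP_COMP |
// __GFP_NORETRY | __GFP_NOWARN on top of the base flags, plus __GFP_ZERO, so
// a failed high-order allocation returns NULL quietly instead of retrying or
// logging a warning.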
@@ -805,6 +815,7 @@ static uvm_cpu_physical_chunk_t *uvm_cpu_chunk_create(uvm_chunk_size_t alloc_siz
NV_STATUS uvm_cpu_chunk_alloc(uvm_chunk_size_t alloc_size,
uvm_cpu_chunk_alloc_flags_t alloc_flags,
int nid,
uvm_cpu_chunk_t **new_chunk)
{
uvm_cpu_physical_chunk_t *chunk;
@@ -812,7 +823,7 @@ NV_STATUS uvm_cpu_chunk_alloc(uvm_chunk_size_t alloc_size,
UVM_ASSERT(new_chunk);
page = uvm_cpu_chunk_alloc_page(alloc_size, alloc_flags);
page = uvm_cpu_chunk_alloc_page(alloc_size, nid, alloc_flags);
if (!page)
return NV_ERR_NO_MEMORY;
@@ -847,6 +858,13 @@ NV_STATUS uvm_cpu_chunk_alloc_hmm(struct page *page,
return NV_OK;
}
int uvm_cpu_chunk_get_numa_node(uvm_cpu_chunk_t *chunk)
{
UVM_ASSERT(chunk);
UVM_ASSERT(chunk->page);
return page_to_nid(chunk->page);
}
NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chunks)
{
NV_STATUS status = NV_OK;

View File

@@ -304,11 +304,24 @@ uvm_chunk_sizes_mask_t uvm_cpu_chunk_get_allocation_sizes(void);
// Allocate a physical CPU chunk of the specified size.
//
// The nid argument is used to indicate a memory node preference. If the
// value is a memory node ID, the chunk allocation will be attempted on
// that memory node. If the chunk cannot be allocated on that memory node,
// it will be allocated on any memory node allowed by the process's policy.
//
// If the value of nid is a memory node ID that is not in the set of
// current process's allowed memory nodes, it will be allocated on one of the
// nodes in the allowed set.
//
// If the value of nid is NUMA_NO_NODE, the chunk will be allocated from any
// of the memory nodes allowed by the process policy.
//
// If a CPU chunk allocation succeeds, NV_OK is returned. new_chunk will be set
// to point to the newly allocated chunk. On failure, NV_ERR_NO_MEMORY is
// returned.
NV_STATUS uvm_cpu_chunk_alloc(uvm_chunk_size_t alloc_size,
uvm_cpu_chunk_alloc_flags_t flags,
int nid,
uvm_cpu_chunk_t **new_chunk);
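//
// Illustrative usage sketch (not part of this interface; the surrounding
// context and error handling are assumptions):
//
//     uvm_cpu_chunk_t *chunk;
//     NV_STATUS status = uvm_cpu_chunk_alloc(PAGE_SIZE,
//                                            UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO,
//                                            numa_mem_id(),
//                                            &chunk);
//
//     if (status == NV_OK) {
//         // The kernel may still place the pages elsewhere, so query the
//         // node that was actually used rather than assuming the hint was
//         // honored.
//         int nid = uvm_cpu_chunk_get_numa_node(chunk);
//
//         uvm_cpu_chunk_free(chunk);
//     }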
// Allocate a HMM CPU chunk.
@@ -375,6 +388,9 @@ static uvm_cpu_logical_chunk_t *uvm_cpu_chunk_to_logical(uvm_cpu_chunk_t *chunk)
return container_of((chunk), uvm_cpu_logical_chunk_t, common);
}
// Return the NUMA node ID of the physical page backing the chunk.
int uvm_cpu_chunk_get_numa_node(uvm_cpu_chunk_t *chunk);
// Free a CPU chunk.
// This may not result in the immediate freeing of the physical pages of the
// chunk if this is a logical chunk and there are other logical chunks holding

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2019 NVIDIA Corporation
Copyright (c) 2017-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -664,6 +664,7 @@ done:
static NV_STATUS test_cpu_chunk_alloc(uvm_chunk_size_t size,
uvm_cpu_chunk_alloc_flags_t flags,
int nid,
uvm_cpu_chunk_t **out_chunk)
{
uvm_cpu_chunk_t *chunk;
@@ -675,7 +676,7 @@ static NV_STATUS test_cpu_chunk_alloc(uvm_chunk_size_t size,
// It is possible that the allocation fails due to lack of large pages
// rather than an API issue, which will result in a false negative.
// However, that should be very rare.
TEST_NV_CHECK_RET(uvm_cpu_chunk_alloc(size, flags, &chunk));
TEST_NV_CHECK_RET(uvm_cpu_chunk_alloc(size, flags, nid, &chunk));
// Check general state of the chunk:
// - chunk should be a physical chunk,
@@ -685,6 +686,12 @@ static NV_STATUS test_cpu_chunk_alloc(uvm_chunk_size_t size,
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(chunk) == size, done);
TEST_CHECK_GOTO(uvm_cpu_chunk_num_pages(chunk) == size / PAGE_SIZE, done);
// It is possible for the kernel to allocate a chunk on a NUMA node other
// than the one requested. However, that should not be an issue with
// sufficient memory on each NUMA node.
if (nid != NUMA_NO_NODE)
TEST_CHECK_GOTO(uvm_cpu_chunk_get_numa_node(chunk) == nid, done);
if (flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO) {
NvU64 *cpu_addr;
@@ -719,7 +726,7 @@ static NV_STATUS test_cpu_chunk_mapping_basic_verify(uvm_gpu_t *gpu,
NvU64 dma_addr;
NV_STATUS status = NV_OK;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, flags, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, flags, NUMA_NO_NODE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
// Check state of the physical chunk:
@@ -763,27 +770,27 @@ static NV_STATUS test_cpu_chunk_mapping_basic(uvm_gpu_t *gpu, uvm_cpu_chunk_allo
return NV_OK;
}
static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu1, uvm_gpu_t *gpu2, uvm_gpu_t *gpu3)
static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1, uvm_gpu_t *gpu2)
{
NV_STATUS status = NV_OK;
uvm_cpu_chunk_t *chunk;
uvm_cpu_physical_chunk_t *phys_chunk;
NvU64 dma_addr_gpu2;
NvU64 dma_addr_gpu1;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(PAGE_SIZE, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(PAGE_SIZE, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu3), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu3), done);
dma_addr_gpu2 = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu2->parent);
uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu3->parent);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
dma_addr_gpu1 = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1->parent);
uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu2->parent);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu0), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu0), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
// DMA mapping addresses for different GPUs live in different IOMMU spaces,
// so it would be perfectly legal for them to have the same IOVA, and even
@@ -793,7 +800,7 @@ static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu1, uvm_gpu_t *gpu2,
// GPU1. It's true that we may get a false negative if both addresses
// happened to alias and we had a bug in how the addresses are shifted in
// the dense array, but that's better than intermittent failure.
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu2->parent) == dma_addr_gpu2, done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1->parent) == dma_addr_gpu1, done);
done:
uvm_cpu_chunk_free(chunk);
@@ -911,7 +918,7 @@ static NV_STATUS test_cpu_chunk_split_and_merge(uvm_gpu_t *gpu)
uvm_cpu_chunk_t *chunk;
NV_STATUS status;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
status = do_test_cpu_chunk_split_and_merge(chunk, gpu);
uvm_cpu_chunk_free(chunk);
@@ -993,7 +1000,7 @@ static NV_STATUS test_cpu_chunk_dirty(uvm_gpu_t *gpu)
uvm_cpu_physical_chunk_t *phys_chunk;
size_t num_pages;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
num_pages = uvm_cpu_chunk_num_pages(chunk);
@@ -1005,7 +1012,7 @@ static NV_STATUS test_cpu_chunk_dirty(uvm_gpu_t *gpu)
uvm_cpu_chunk_free(chunk);
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO, NUMA_NO_NODE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
num_pages = uvm_cpu_chunk_num_pages(chunk);
@@ -1170,13 +1177,35 @@ NV_STATUS test_cpu_chunk_free(uvm_va_space_t *va_space, uvm_processor_mask_t *te
size_t size = uvm_chunk_find_next_size(alloc_sizes, PAGE_SIZE);
for_each_chunk_size_from(size, alloc_sizes) {
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
TEST_NV_CHECK_RET(do_test_cpu_chunk_free(chunk, va_space, test_gpus));
}
return NV_OK;
}
static NV_STATUS test_cpu_chunk_numa_alloc(uvm_va_space_t *va_space)
{
uvm_cpu_chunk_t *chunk;
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
size_t size;
for_each_chunk_size(size, alloc_sizes) {
int nid;
for_each_possible_uvm_node(nid) {
// Do not test CPU allocation on nodes that have no memory or CPU
if (!node_state(nid, N_MEMORY) || !node_state(nid, N_CPU))
continue;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, nid, &chunk));
uvm_cpu_chunk_free(chunk);
}
}
return NV_OK;
}
NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
@@ -1197,6 +1226,7 @@ NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct f
}
TEST_NV_CHECK_GOTO(test_cpu_chunk_free(va_space, &test_gpus), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_numa_alloc(va_space), done);
if (uvm_processor_mask_get_gpu_count(&test_gpus) >= 3) {
uvm_gpu_t *gpu2, *gpu3;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -324,7 +324,7 @@ static NV_STATUS gpu_mem_check(uvm_gpu_t *gpu,
// TODO: Bug 3839176: [UVM][HCC][uvm_test] Update tests that assume GPU
// engines can directly access sysmem
// Skip this test for now. To enable this test in Confidential Computing,
// Skip this test for now. To enable this test under SEV,
// The GPU->CPU CE copy needs to be updated so it uses encryption when
// CC is enabled.
if (uvm_conf_computing_mode_enabled(gpu))
@@ -1068,7 +1068,7 @@ static NV_STATUS test_pmm_reverse_map_single(uvm_gpu_t *gpu, uvm_va_space_t *va_
uvm_mutex_lock(&va_block->lock);
is_resident = uvm_processor_mask_test(&va_block->resident, gpu->id) &&
uvm_page_mask_full(uvm_va_block_resident_mask_get(va_block, gpu->id));
uvm_page_mask_full(uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE));
if (is_resident)
phys_addr = uvm_va_block_gpu_phys_page_address(va_block, 0, gpu);
@@ -1154,7 +1154,7 @@ static NV_STATUS test_pmm_reverse_map_many_blocks(uvm_gpu_t *gpu, uvm_va_space_t
uvm_mutex_lock(&va_block->lock);
// Verify that all pages are populated on the GPU
is_resident = uvm_page_mask_region_full(uvm_va_block_resident_mask_get(va_block, gpu->id),
is_resident = uvm_page_mask_region_full(uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE),
reverse_mapping->region);
uvm_mutex_unlock(&va_block->lock);
@@ -1223,6 +1223,8 @@ static NV_STATUS test_indirect_peers(uvm_gpu_t *owning_gpu, uvm_gpu_t *accessing
if (!chunks)
return NV_ERR_NO_MEMORY;
UVM_ASSERT(!g_uvm_global.sev_enabled);
TEST_NV_CHECK_GOTO(uvm_mem_alloc_sysmem_and_map_cpu_kernel(UVM_CHUNK_SIZE_MAX, current->mm, &verif_mem), out);
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(verif_mem, owning_gpu), out);
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(verif_mem, accessing_gpu), out);

View File

@@ -176,7 +176,9 @@ static NV_STATUS preferred_location_unmap_remote_pages(uvm_va_block_t *va_block,
mapped_mask = uvm_va_block_map_mask_get(va_block, preferred_location);
if (uvm_processor_mask_test(&va_block->resident, preferred_location)) {
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, preferred_location);
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block,
preferred_location,
NUMA_NO_NODE);
if (!uvm_page_mask_andnot(&va_block_context->caller_page_mask, mapped_mask, resident_mask))
goto done;
@@ -638,7 +640,7 @@ static NV_STATUS va_block_set_read_duplication_locked(uvm_va_block_t *va_block,
for_each_id_in_mask(src_id, &va_block->resident) {
NV_STATUS status;
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id);
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id, NUMA_NO_NODE);
// Calling uvm_va_block_make_resident_read_duplicate will break all
// SetAccessedBy and remote mappings
@@ -695,7 +697,7 @@ static NV_STATUS va_block_unset_read_duplication_locked(uvm_va_block_t *va_block
// If preferred_location is set and has resident copies, give it preference
if (UVM_ID_IS_VALID(preferred_location) &&
uvm_processor_mask_test(&va_block->resident, preferred_location)) {
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, preferred_location);
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, preferred_location, NUMA_NO_NODE);
bool is_mask_empty = !uvm_page_mask_and(break_read_duplication_pages,
&va_block->read_duplicated_pages,
resident_mask);
@@ -723,7 +725,7 @@ static NV_STATUS va_block_unset_read_duplication_locked(uvm_va_block_t *va_block
if (uvm_id_equal(processor_id, preferred_location))
continue;
resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id);
resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id, NUMA_NO_NODE);
is_mask_empty = !uvm_page_mask_and(break_read_duplication_pages,
&va_block->read_duplicated_pages,
resident_mask);

View File

@@ -0,0 +1,40 @@
/*******************************************************************************
Copyright (c) 2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#include "uvm_processors.h"
int uvm_find_closest_node_mask(int src, const nodemask_t *mask)
{
int nid;
int closest_nid = NUMA_NO_NODE;
if (node_isset(src, *mask))
return src;
for_each_set_bit(nid, mask->bits, MAX_NUMNODES) {
if (closest_nid == NUMA_NO_NODE || node_distance(src, nid) < node_distance(src, closest_nid))
closest_nid = nid;
}
return closest_nid;
}
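// Illustrative usage sketch (the caller and the candidate mask below are
// assumptions, not part of this file): pick the node closest to the current
// CPU's memory node among the nodes that have memory.
//
//     nodemask_t candidates = NODE_MASK_NONE;
//     int nid, closest;
//
//     for_each_node_state(nid, N_MEMORY)
//         node_set(nid, candidates);
//
//     closest = uvm_find_closest_node_mask(numa_mem_id(), &candidates);
//     if (closest == NUMA_NO_NODE)
//         closest = numa_mem_id();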

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2019 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -26,6 +26,7 @@
#include "uvm_linux.h"
#include "uvm_common.h"
#include <linux/numa.h>
#define UVM_MAX_UNIQUE_GPU_PAIRS SUM_FROM_0_TO_N(UVM_MAX_GPUS - 1)
@@ -37,11 +38,11 @@
// provide type safety, they are wrapped within the uvm_processor_id_t struct.
// The range of valid identifiers needs to cover the maximum number of
// supported GPUs on a system plus the CPU. CPU is assigned value 0, and GPUs
// range: [1, UVM_ID_MAX_GPUS].
// range: [1, UVM_PARENT_ID_MAX_GPUS].
//
// There are some functions that only expect GPU identifiers and, in order to
// make it clearer, the uvm_gpu_id_t alias type is provided. However, as this
// type is just a typedef of uvm_processor_id_t, there is no type checking
// make it clearer, the uvm_parent_gpu_id_t alias type is provided. However, as
// this type is just a typedef of uvm_processor_id_t, there is no type checking
// performed by the compiler.
//
// Identifier value vs index
@@ -60,22 +61,25 @@
// the GPU within the GPU id space (basically id - 1).
//
// In the diagram below, MAX_SUB is used to abbreviate
// UVM_ID_MAX_SUB_PROCESSORS.
// UVM_PARENT_ID_MAX_SUB_PROCESSORS.
//
// |-------------------------- uvm_processor_id_t ----------------------|
// | |
// | |----------------------- uvm_gpu_id_t ------------------------||
// | | ||
// Proc type | CPU | GPU ... GPU ... GPU ||
// | | ||
// ID values | 0 | 1 ... i+1 ... UVM_ID_MAX_PROCESSORS-1 ||
// TODO: Bug 4195538: uvm_parent_processor_id_t is currently but temporarily the
// same as uvm_processor_id_t.
//
// GPU index 0 ... i ... UVM_ID_MAX_GPUS-1
// |-------------------------- uvm_parent_processor_id_t ----------------------|
// | |
// | |----------------------- uvm_parent_gpu_id_t ------------------------||
// | | ||
// Proc type | CPU | GPU ... GPU ... GPU ||
// | | ||
// ID values | 0 | 1 ... i+1 ... UVM_PARENT_ID_MAX_PROCESSORS-1 ||
//
// GPU index 0 ... i ... UVM_PARENT_ID_MAX_GPUS-1
// | | | |
// | | | |
// | |-------------| | |-----------------------------|
// | | | |
// | | | |
// | |-------------| | |------------------------------------|
// | | | |
// | | | |
// GPU index 0 ... MAX_SUB-1 ... i*MAX_SUB ... (i+1)*MAX_SUB-1 ... UVM_GLOBAL_ID_MAX_GPUS-1
//
// ID values | 0 | 1 ... MAX_SUB ... (i*MAX_SUB)+1 ... (i+1)*MAX_SUB ... UVM_GLOBAL_ID_MAX_PROCESSORS-1 ||
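// Worked example of the index math above: with MAX_SUB = 8
// (UVM_PARENT_ID_MAX_SUB_PROCESSORS), the parent GPU at index i = 2 (parent
// ID value 3) covers global GPU indices 16..23, i.e. global ID values 17..24.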
@@ -210,7 +214,7 @@ static proc_id_t prefix_fn_mask##_find_first_id(const mask_t *mask)
\
static proc_id_t prefix_fn_mask##_find_first_gpu_id(const mask_t *mask) \
{ \
return proc_id_ctor(find_next_bit(mask->bitmap, (maxval), UVM_ID_GPU0_VALUE)); \
return proc_id_ctor(find_next_bit(mask->bitmap, (maxval), UVM_PARENT_ID_GPU0_VALUE)); \
} \
\
static proc_id_t prefix_fn_mask##_find_next_id(const mask_t *mask, proc_id_t min_id) \
@@ -252,7 +256,7 @@ static NvU32 prefix_fn_mask##_get_gpu_count(const mask_t *mask)
{ \
NvU32 gpu_count = prefix_fn_mask##_get_count(mask); \
\
if (prefix_fn_mask##_test(mask, proc_id_ctor(UVM_ID_CPU_VALUE))) \
if (prefix_fn_mask##_test(mask, proc_id_ctor(UVM_PARENT_ID_CPU_VALUE))) \
--gpu_count; \
\
return gpu_count; \
@@ -261,55 +265,55 @@ static NvU32 prefix_fn_mask##_get_gpu_count(const mask_t *mask)
typedef struct
{
NvU32 val;
} uvm_processor_id_t;
} uvm_parent_processor_id_t;
typedef struct
{
NvU32 val;
} uvm_global_processor_id_t;
typedef uvm_processor_id_t uvm_gpu_id_t;
typedef uvm_parent_processor_id_t uvm_parent_gpu_id_t;
typedef uvm_global_processor_id_t uvm_global_gpu_id_t;
// Static value assigned to the CPU
#define UVM_ID_CPU_VALUE 0
#define UVM_ID_GPU0_VALUE (UVM_ID_CPU_VALUE + 1)
#define UVM_PARENT_ID_CPU_VALUE 0
#define UVM_PARENT_ID_GPU0_VALUE (UVM_PARENT_ID_CPU_VALUE + 1)
// ID values for the CPU and first GPU, respectively; the values for both types
// of IDs must match to enable sharing of UVM_PROCESSOR_MASK().
#define UVM_GLOBAL_ID_CPU_VALUE UVM_ID_CPU_VALUE
#define UVM_GLOBAL_ID_GPU0_VALUE UVM_ID_GPU0_VALUE
#define UVM_GLOBAL_ID_CPU_VALUE UVM_PARENT_ID_CPU_VALUE
#define UVM_GLOBAL_ID_GPU0_VALUE UVM_PARENT_ID_GPU0_VALUE
// Maximum number of GPUs/processors that can be represented with the id types
#define UVM_ID_MAX_GPUS UVM_MAX_GPUS
#define UVM_ID_MAX_PROCESSORS UVM_MAX_PROCESSORS
#define UVM_PARENT_ID_MAX_GPUS UVM_MAX_GPUS
#define UVM_PARENT_ID_MAX_PROCESSORS UVM_MAX_PROCESSORS
#define UVM_ID_MAX_SUB_PROCESSORS 8
#define UVM_PARENT_ID_MAX_SUB_PROCESSORS 8
#define UVM_GLOBAL_ID_MAX_GPUS (UVM_MAX_GPUS * UVM_ID_MAX_SUB_PROCESSORS)
#define UVM_GLOBAL_ID_MAX_GPUS (UVM_PARENT_ID_MAX_GPUS * UVM_PARENT_ID_MAX_SUB_PROCESSORS)
#define UVM_GLOBAL_ID_MAX_PROCESSORS (UVM_GLOBAL_ID_MAX_GPUS + 1)
#define UVM_ID_CPU ((uvm_processor_id_t) { .val = UVM_ID_CPU_VALUE })
#define UVM_ID_INVALID ((uvm_processor_id_t) { .val = UVM_ID_MAX_PROCESSORS })
#define UVM_PARENT_ID_CPU ((uvm_parent_processor_id_t) { .val = UVM_PARENT_ID_CPU_VALUE })
#define UVM_PARENT_ID_INVALID ((uvm_parent_processor_id_t) { .val = UVM_PARENT_ID_MAX_PROCESSORS })
#define UVM_GLOBAL_ID_CPU ((uvm_global_processor_id_t) { .val = UVM_GLOBAL_ID_CPU_VALUE })
#define UVM_GLOBAL_ID_INVALID ((uvm_global_processor_id_t) { .val = UVM_GLOBAL_ID_MAX_PROCESSORS })
#define UVM_ID_CHECK_BOUNDS(id) UVM_ASSERT_MSG(id.val <= UVM_ID_MAX_PROCESSORS, "id %u\n", id.val)
#define UVM_PARENT_ID_CHECK_BOUNDS(id) UVM_ASSERT_MSG(id.val <= UVM_PARENT_ID_MAX_PROCESSORS, "id %u\n", id.val)
#define UVM_GLOBAL_ID_CHECK_BOUNDS(id) UVM_ASSERT_MSG(id.val <= UVM_GLOBAL_ID_MAX_PROCESSORS, "id %u\n", id.val)
static int uvm_id_cmp(uvm_processor_id_t id1, uvm_processor_id_t id2)
static int uvm_parent_id_cmp(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
{
UVM_ID_CHECK_BOUNDS(id1);
UVM_ID_CHECK_BOUNDS(id2);
UVM_PARENT_ID_CHECK_BOUNDS(id1);
UVM_PARENT_ID_CHECK_BOUNDS(id2);
return UVM_CMP_DEFAULT(id1.val, id2.val);
}
static bool uvm_id_equal(uvm_processor_id_t id1, uvm_processor_id_t id2)
static bool uvm_parent_id_equal(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
{
UVM_ID_CHECK_BOUNDS(id1);
UVM_ID_CHECK_BOUNDS(id2);
UVM_PARENT_ID_CHECK_BOUNDS(id1);
UVM_PARENT_ID_CHECK_BOUNDS(id2);
return id1.val == id2.val;
}
@@ -330,30 +334,30 @@ static bool uvm_global_id_equal(uvm_global_processor_id_t id1, uvm_global_proces
return id1.val == id2.val;
}
#define UVM_ID_IS_CPU(id) uvm_id_equal(id, UVM_ID_CPU)
#define UVM_ID_IS_INVALID(id) uvm_id_equal(id, UVM_ID_INVALID)
#define UVM_ID_IS_VALID(id) (!UVM_ID_IS_INVALID(id))
#define UVM_ID_IS_GPU(id) (!UVM_ID_IS_CPU(id) && !UVM_ID_IS_INVALID(id))
#define UVM_PARENT_ID_IS_CPU(id) uvm_parent_id_equal(id, UVM_PARENT_ID_CPU)
#define UVM_PARENT_ID_IS_INVALID(id) uvm_parent_id_equal(id, UVM_PARENT_ID_INVALID)
#define UVM_PARENT_ID_IS_VALID(id) (!UVM_PARENT_ID_IS_INVALID(id))
#define UVM_PARENT_ID_IS_GPU(id) (!UVM_PARENT_ID_IS_CPU(id) && !UVM_PARENT_ID_IS_INVALID(id))
#define UVM_GLOBAL_ID_IS_CPU(id) uvm_global_id_equal(id, UVM_GLOBAL_ID_CPU)
#define UVM_GLOBAL_ID_IS_INVALID(id) uvm_global_id_equal(id, UVM_GLOBAL_ID_INVALID)
#define UVM_GLOBAL_ID_IS_VALID(id) (!UVM_GLOBAL_ID_IS_INVALID(id))
#define UVM_GLOBAL_ID_IS_GPU(id) (!UVM_GLOBAL_ID_IS_CPU(id) && !UVM_GLOBAL_ID_IS_INVALID(id))
static uvm_processor_id_t uvm_id_from_value(NvU32 val)
static uvm_parent_processor_id_t uvm_parent_id_from_value(NvU32 val)
{
uvm_processor_id_t ret = { .val = val };
uvm_parent_processor_id_t ret = { .val = val };
UVM_ID_CHECK_BOUNDS(ret);
UVM_PARENT_ID_CHECK_BOUNDS(ret);
return ret;
}
static uvm_gpu_id_t uvm_gpu_id_from_value(NvU32 val)
static uvm_parent_gpu_id_t uvm_parent_gpu_id_from_value(NvU32 val)
{
uvm_gpu_id_t ret = uvm_id_from_value(val);
uvm_parent_gpu_id_t ret = uvm_parent_id_from_value(val);
UVM_ASSERT(!UVM_ID_IS_CPU(ret));
UVM_ASSERT(!UVM_PARENT_ID_IS_CPU(ret));
return ret;
}
@@ -376,34 +380,34 @@ static uvm_global_gpu_id_t uvm_global_gpu_id_from_value(NvU32 val)
return ret;
}
// Create a GPU id from the given GPU id index (previously obtained via
// uvm_id_gpu_index)
static uvm_gpu_id_t uvm_gpu_id_from_index(NvU32 index)
// Create a parent GPU id from the given parent GPU id index (previously
// obtained via uvm_parent_id_gpu_index)
static uvm_parent_gpu_id_t uvm_parent_gpu_id_from_index(NvU32 index)
{
return uvm_gpu_id_from_value(index + UVM_ID_GPU0_VALUE);
return uvm_parent_gpu_id_from_value(index + UVM_PARENT_ID_GPU0_VALUE);
}
static uvm_processor_id_t uvm_id_next(uvm_processor_id_t id)
static uvm_parent_processor_id_t uvm_parent_id_next(uvm_parent_processor_id_t id)
{
++id.val;
UVM_ID_CHECK_BOUNDS(id);
UVM_PARENT_ID_CHECK_BOUNDS(id);
return id;
}
static uvm_gpu_id_t uvm_gpu_id_next(uvm_gpu_id_t id)
static uvm_parent_gpu_id_t uvm_parent_gpu_id_next(uvm_parent_gpu_id_t id)
{
UVM_ASSERT(UVM_ID_IS_GPU(id));
UVM_ASSERT(UVM_PARENT_ID_IS_GPU(id));
++id.val;
UVM_ID_CHECK_BOUNDS(id);
UVM_PARENT_ID_CHECK_BOUNDS(id);
return id;
}
// Same as uvm_gpu_id_from_index but for uvm_global_processor_id_t
// Same as uvm_parent_gpu_id_from_index but for uvm_global_processor_id_t
static uvm_global_gpu_id_t uvm_global_gpu_id_from_index(NvU32 index)
{
return uvm_global_gpu_id_from_value(index + UVM_GLOBAL_ID_GPU0_VALUE);
@@ -429,11 +433,11 @@ static uvm_global_gpu_id_t uvm_global_gpu_id_next(uvm_global_gpu_id_t id)
return id;
}
// This function returns the numerical value within [0, UVM_ID_MAX_PROCESSORS)
// of the given processor id
static NvU32 uvm_id_value(uvm_processor_id_t id)
// This function returns the numerical value within
// [0, UVM_PARENT_ID_MAX_PROCESSORS) of the given parent processor id.
static NvU32 uvm_parent_id_value(uvm_parent_processor_id_t id)
{
UVM_ASSERT(UVM_ID_IS_VALID(id));
UVM_ASSERT(UVM_PARENT_ID_IS_VALID(id));
return id.val;
}
@@ -448,12 +452,12 @@ static NvU32 uvm_global_id_value(uvm_global_processor_id_t id)
}
// This function returns the index of the given GPU id within the GPU id space
// [0, UVM_ID_MAX_GPUS)
static NvU32 uvm_id_gpu_index(uvm_gpu_id_t id)
// [0, UVM_PARENT_ID_MAX_GPUS)
static NvU32 uvm_parent_id_gpu_index(uvm_parent_gpu_id_t id)
{
UVM_ASSERT(UVM_ID_IS_GPU(id));
UVM_ASSERT(UVM_PARENT_ID_IS_GPU(id));
return id.val - UVM_ID_GPU0_VALUE;
return id.val - UVM_PARENT_ID_GPU0_VALUE;
}
// This function returns the index of the given GPU id within the GPU id space
@@ -465,61 +469,61 @@ static NvU32 uvm_global_id_gpu_index(const uvm_global_gpu_id_t id)
return id.val - UVM_GLOBAL_ID_GPU0_VALUE;
}
static NvU32 uvm_global_id_gpu_index_from_gpu_id(const uvm_gpu_id_t id)
static NvU32 uvm_global_id_gpu_index_from_parent_gpu_id(const uvm_parent_gpu_id_t id)
{
UVM_ASSERT(UVM_ID_IS_GPU(id));
UVM_ASSERT(UVM_PARENT_ID_IS_GPU(id));
return uvm_id_gpu_index(id) * UVM_ID_MAX_SUB_PROCESSORS;
return uvm_parent_id_gpu_index(id) * UVM_PARENT_ID_MAX_SUB_PROCESSORS;
}
static NvU32 uvm_id_gpu_index_from_global_gpu_id(const uvm_global_gpu_id_t id)
static NvU32 uvm_parent_id_gpu_index_from_global_gpu_id(const uvm_global_gpu_id_t id)
{
UVM_ASSERT(UVM_GLOBAL_ID_IS_GPU(id));
return uvm_global_id_gpu_index(id) / UVM_ID_MAX_SUB_PROCESSORS;
return uvm_global_id_gpu_index(id) / UVM_PARENT_ID_MAX_SUB_PROCESSORS;
}
static uvm_global_gpu_id_t uvm_global_gpu_id_from_gpu_id(const uvm_gpu_id_t id)
static uvm_global_gpu_id_t uvm_global_gpu_id_from_parent_gpu_id(const uvm_parent_gpu_id_t id)
{
UVM_ASSERT(UVM_ID_IS_GPU(id));
UVM_ASSERT(UVM_PARENT_ID_IS_GPU(id));
return uvm_global_gpu_id_from_index(uvm_global_id_gpu_index_from_gpu_id(id));
return uvm_global_gpu_id_from_index(uvm_global_id_gpu_index_from_parent_gpu_id(id));
}
static uvm_global_gpu_id_t uvm_global_gpu_id_from_parent_index(NvU32 index)
{
UVM_ASSERT(index < UVM_MAX_GPUS);
UVM_ASSERT(index < UVM_PARENT_ID_MAX_GPUS);
return uvm_global_gpu_id_from_gpu_id(uvm_gpu_id_from_value(index + UVM_GLOBAL_ID_GPU0_VALUE));
return uvm_global_gpu_id_from_parent_gpu_id(uvm_parent_gpu_id_from_value(index + UVM_GLOBAL_ID_GPU0_VALUE));
}
static uvm_global_gpu_id_t uvm_global_gpu_id_from_sub_processor_index(const uvm_gpu_id_t id, NvU32 sub_index)
static uvm_global_gpu_id_t uvm_global_gpu_id_from_sub_processor_index(const uvm_parent_gpu_id_t id, NvU32 sub_index)
{
NvU32 index;
UVM_ASSERT(sub_index < UVM_ID_MAX_SUB_PROCESSORS);
UVM_ASSERT(sub_index < UVM_PARENT_ID_MAX_SUB_PROCESSORS);
index = uvm_global_id_gpu_index_from_gpu_id(id) + sub_index;
index = uvm_global_id_gpu_index_from_parent_gpu_id(id) + sub_index;
return uvm_global_gpu_id_from_index(index);
}
static uvm_gpu_id_t uvm_gpu_id_from_global_gpu_id(const uvm_global_gpu_id_t id)
static uvm_parent_gpu_id_t uvm_parent_gpu_id_from_global_gpu_id(const uvm_global_gpu_id_t id)
{
UVM_ASSERT(UVM_GLOBAL_ID_IS_GPU(id));
return uvm_gpu_id_from_index(uvm_id_gpu_index_from_global_gpu_id(id));
return uvm_parent_gpu_id_from_index(uvm_parent_id_gpu_index_from_global_gpu_id(id));
}
static NvU32 uvm_global_id_sub_processor_index(const uvm_global_gpu_id_t id)
{
return uvm_global_id_gpu_index(id) % UVM_ID_MAX_SUB_PROCESSORS;
return uvm_global_id_gpu_index(id) % UVM_PARENT_ID_MAX_SUB_PROCESSORS;
}
UVM_PROCESSOR_MASK(uvm_processor_mask_t, \
uvm_processor_mask, \
UVM_ID_MAX_PROCESSORS, \
uvm_processor_id_t, \
uvm_id_from_value)
UVM_PARENT_ID_MAX_PROCESSORS, \
uvm_parent_processor_id_t, \
uvm_parent_id_from_value)
UVM_PROCESSOR_MASK(uvm_global_processor_mask_t, \
uvm_global_processor_mask, \
@@ -533,19 +537,19 @@ static bool uvm_processor_mask_gpu_subset(const uvm_processor_mask_t *subset, co
{
uvm_processor_mask_t subset_gpus;
uvm_processor_mask_copy(&subset_gpus, subset);
uvm_processor_mask_clear(&subset_gpus, UVM_ID_CPU);
uvm_processor_mask_clear(&subset_gpus, UVM_PARENT_ID_CPU);
return uvm_processor_mask_subset(&subset_gpus, mask);
}
#define for_each_id_in_mask(id, mask) \
for ((id) = uvm_processor_mask_find_first_id(mask); \
UVM_ID_IS_VALID(id); \
(id) = uvm_processor_mask_find_next_id((mask), uvm_id_next(id)))
UVM_PARENT_ID_IS_VALID(id); \
(id) = uvm_processor_mask_find_next_id((mask), uvm_parent_id_next(id)))
#define for_each_gpu_id_in_mask(gpu_id, mask) \
for ((gpu_id) = uvm_processor_mask_find_first_gpu_id((mask)); \
UVM_ID_IS_VALID(gpu_id); \
(gpu_id) = uvm_processor_mask_find_next_id((mask), uvm_gpu_id_next(gpu_id)))
UVM_PARENT_ID_IS_VALID(gpu_id); \
(gpu_id) = uvm_processor_mask_find_next_id((mask), uvm_parent_gpu_id_next(gpu_id)))
#define for_each_global_id_in_mask(id, mask) \
for ((id) = uvm_global_processor_mask_find_first_id(mask); \
@@ -559,21 +563,36 @@ static bool uvm_processor_mask_gpu_subset(const uvm_processor_mask_t *subset, co
// Helper to iterate over all valid gpu ids
#define for_each_gpu_id(i) \
for (i = uvm_gpu_id_from_value(UVM_ID_GPU0_VALUE); UVM_ID_IS_VALID(i); i = uvm_gpu_id_next(i))
for (i = uvm_parent_gpu_id_from_value(UVM_PARENT_ID_GPU0_VALUE); UVM_PARENT_ID_IS_VALID(i); i = uvm_parent_gpu_id_next(i))
#define for_each_global_gpu_id(i) \
for (i = uvm_global_gpu_id_from_value(UVM_GLOBAL_ID_GPU0_VALUE); UVM_GLOBAL_ID_IS_VALID(i); i = uvm_global_gpu_id_next(i))
#define for_each_global_sub_processor_id_in_gpu(id, i) \
for (i = uvm_global_gpu_id_from_gpu_id(id); \
for (i = uvm_global_gpu_id_from_parent_gpu_id(id); \
UVM_GLOBAL_ID_IS_VALID(i) && \
(uvm_global_id_value(i) < uvm_global_id_value(uvm_global_gpu_id_from_gpu_id(id)) + UVM_ID_MAX_SUB_PROCESSORS); \
(uvm_global_id_value(i) < uvm_global_id_value(uvm_global_gpu_id_from_parent_gpu_id(id)) + UVM_PARENT_ID_MAX_SUB_PROCESSORS); \
i = uvm_global_gpu_id_next(i))
// Helper to iterate over all valid processor ids
#define for_each_processor_id(i) for (i = UVM_ID_CPU; UVM_ID_IS_VALID(i); i = uvm_id_next(i))
#define for_each_processor_id(i) for (i = UVM_PARENT_ID_CPU; UVM_PARENT_ID_IS_VALID(i); i = uvm_parent_id_next(i))
#define for_each_global_id(i) for (i = UVM_GLOBAL_ID_CPU; UVM_GLOBAL_ID_IS_VALID(i); i = uvm_global_id_next(i))
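// Editorial illustration, not part of this commit: usage sketch of the mask
// iterators above. The counting it performs is already provided by
// uvm_processor_mask_get_gpu_count(); this only shows the iteration pattern.
static NvU32 example_count_gpus_in_mask(const uvm_processor_mask_t *mask)
{
    uvm_parent_gpu_id_t gpu_id;
    NvU32 gpu_count = 0;

    // Visits only the GPU ids set in the mask, skipping the CPU bit.
    for_each_gpu_id_in_mask(gpu_id, mask)
        ++gpu_count;

    return gpu_count;
}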
// Find the node in mask with the shortest distance (as returned by
// node_distance()) from src.
// Note that the search is inclusive of src.
// If mask has no bits set, NUMA_NO_NODE is returned.
int uvm_find_closest_node_mask(int src, const nodemask_t *mask);
// Iterate over all nodes in mask with increasing distance from src.
// Note that this iterator is destructive of the mask.
#define for_each_closest_uvm_node(nid, src, mask) \
for ((nid) = uvm_find_closest_node_mask((src), &(mask)); \
(nid) != NUMA_NO_NODE; \
node_clear((nid), (mask)), (nid) = uvm_find_closest_node_mask((src), &(mask)))
#define for_each_possible_uvm_node(nid) for_each_node_mask((nid), node_possible_map)
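// Editorial illustration, not part of this commit: because
// for_each_closest_uvm_node() clears bits as it iterates, callers are expected
// to iterate over a scratch copy of their node set. A minimal sketch, assuming
// the caller can afford to copy the nodemask by value:
static void example_visit_nodes_closest_first(int src_nid, nodemask_t candidates)
{
    int nid;

    // 'candidates' is a by-value copy, so the destructive iteration below does
    // not disturb the caller's node set.
    for_each_closest_uvm_node(nid, src_nid, candidates) {
        // Nodes are visited in order of increasing node_distance() from
        // src_nid, including src_nid itself when it is set in the mask.
    }
}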
static bool uvm_processor_uuid_eq(const NvProcessorUuid *uuid1, const NvProcessorUuid *uuid2)
{
return memcmp(uuid1, uuid2, sizeof(*uuid1)) == 0;
@@ -585,4 +604,78 @@ static void uvm_processor_uuid_copy(NvProcessorUuid *dst, const NvProcessorUuid
memcpy(dst, src, sizeof(*dst));
}
// TODO: Bug 4195538: [uvm][multi-SMC] Get UVM internal data structures ready to
// meet multi-SMC requirements. Temporary aliases, they must be removed once
// the data structures are converted.
typedef uvm_parent_processor_id_t uvm_processor_id_t;
typedef uvm_parent_gpu_id_t uvm_gpu_id_t;
#define UVM_ID_CPU_VALUE UVM_PARENT_ID_CPU_VALUE
#define UVM_ID_GPU0_VALUE UVM_PARENT_ID_GPU0_VALUE
#define UVM_ID_MAX_GPUS UVM_PARENT_ID_MAX_GPUS
#define UVM_ID_MAX_PROCESSORS UVM_PARENT_ID_MAX_PROCESSORS
#define UVM_ID_MAX_SUB_PROCESSORS UVM_PARENT_ID_MAX_SUB_PROCESSORS
#define UVM_ID_CPU UVM_PARENT_ID_CPU
#define UVM_ID_INVALID UVM_PARENT_ID_INVALID
static int uvm_id_cmp(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
{
return UVM_CMP_DEFAULT(id1.val, id2.val);
}
static bool uvm_id_equal(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
{
return uvm_parent_id_equal(id1, id2);
}
#define UVM_ID_IS_CPU(id) uvm_id_equal(id, UVM_ID_CPU)
#define UVM_ID_IS_INVALID(id) uvm_id_equal(id, UVM_ID_INVALID)
#define UVM_ID_IS_VALID(id) (!UVM_ID_IS_INVALID(id))
#define UVM_ID_IS_GPU(id) (!UVM_ID_IS_CPU(id) && !UVM_ID_IS_INVALID(id))
static uvm_parent_gpu_id_t uvm_gpu_id_from_value(NvU32 val)
{
return uvm_parent_gpu_id_from_value(val);
}
static NvU32 uvm_id_value(uvm_parent_processor_id_t id)
{
return uvm_parent_id_value(id);
}
static NvU32 uvm_id_gpu_index(uvm_parent_gpu_id_t id)
{
return uvm_parent_id_gpu_index(id);
}
static NvU32 uvm_id_gpu_index_from_global_gpu_id(const uvm_global_gpu_id_t id)
{
return uvm_parent_id_gpu_index_from_global_gpu_id(id);
}
static uvm_parent_gpu_id_t uvm_gpu_id_from_index(NvU32 index)
{
return uvm_parent_gpu_id_from_index(index);
}
static uvm_parent_gpu_id_t uvm_gpu_id_next(uvm_parent_gpu_id_t id)
{
return uvm_parent_gpu_id_next(id);
}
static uvm_parent_gpu_id_t uvm_gpu_id_from_global_gpu_id(const uvm_global_gpu_id_t id)
{
return uvm_parent_gpu_id_from_global_gpu_id(id);
}
static NvU32 uvm_global_id_gpu_index_from_gpu_id(const uvm_parent_gpu_id_t id)
{
return uvm_global_id_gpu_index_from_parent_gpu_id(id);
}
static uvm_global_gpu_id_t uvm_global_gpu_id_from_gpu_id(const uvm_parent_gpu_id_t id)
{
return uvm_global_gpu_id_from_parent_gpu_id(id);
}
#endif

View File

@@ -106,26 +106,6 @@ static NV_STATUS uvm_test_nv_kthread_q(UVM_TEST_NV_KTHREAD_Q_PARAMS *params, str
return NV_ERR_INVALID_STATE;
}
static NV_STATUS uvm_test_numa_get_closest_cpu_node_to_gpu(UVM_TEST_NUMA_GET_CLOSEST_CPU_NODE_TO_GPU_PARAMS *params,
struct file *filp)
{
uvm_gpu_t *gpu;
NV_STATUS status;
uvm_rm_user_object_t user_rm_va_space = {
.rm_control_fd = -1,
.user_client = params->client,
.user_object = params->smc_part_ref
};
status = uvm_gpu_retain_by_uuid(&params->gpu_uuid, &user_rm_va_space, &gpu);
if (status != NV_OK)
return status;
params->node_id = gpu->parent->closest_cpu_numa_node;
uvm_gpu_release(gpu);
return NV_OK;
}
// Callers of this function should ensure that node is not NUMA_NO_NODE in order
// to avoid overrunning the kernel's node to cpumask map.
static NV_STATUS uvm_test_verify_bh_affinity(uvm_intr_handler_t *isr, int node)
@@ -307,8 +287,6 @@ long uvm_test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_DRAIN_REPLAYABLE_FAULTS, uvm_test_drain_replayable_faults);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PMA_GET_BATCH_SIZE, uvm_test_pma_get_batch_size);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PMM_QUERY_PMA_STATS, uvm_test_pmm_query_pma_stats);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_NUMA_GET_CLOSEST_CPU_NODE_TO_GPU,
uvm_test_numa_get_closest_cpu_node_to_gpu);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_NUMA_CHECK_AFFINITY, uvm_test_numa_check_affinity);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_VA_SPACE_ADD_DUMMY_THREAD_CONTEXTS,
uvm_test_va_space_add_dummy_thread_contexts);

View File

@@ -561,6 +561,22 @@ typedef struct
// user_pages_allocation_retry_force_count, but the injection point simulates
// driver metadata allocation failure.
//
// cpu_chunk_allocation_target_id and cpu_chunk_allocation_actual_id are used
// to control the NUMA node IDs for CPU chunk allocations, specifically for
// testing overlapping CPU chunk allocations.
//
// Currently, uvm_api_migrate() does not pass the preferred CPU NUMA node for
// managed memory, so it is not possible to request a specific node.
// cpu_chunk_allocation_target_id is used to request that the allocation be
// made on a specific node. On the other hand, cpu_chunk_allocation_actual_id
// is the node on which the allocation will actually be made.
//
// The two parameters can be used to force a CPU chunk allocation to overlap a
// previously allocated chunk.
//
// Please note that even when specifying cpu_chunk_allocation_actual_id, the
// kernel may end up allocating on a different node.
//
// Error returns:
// NV_ERR_INVALID_ADDRESS
// - lookup_address doesn't match a UVM range
@@ -571,6 +587,8 @@ typedef struct
NvU32 page_table_allocation_retry_force_count; // In
NvU32 user_pages_allocation_retry_force_count; // In
NvU32 cpu_chunk_allocation_size_mask; // In
NvS32 cpu_chunk_allocation_target_id; // In
NvS32 cpu_chunk_allocation_actual_id; // In
NvU32 cpu_pages_allocation_error_count; // In
NvBool eviction_error; // In
NvBool populate_error; // In
@@ -604,6 +622,10 @@ typedef struct
NvProcessorUuid resident_on[UVM_MAX_PROCESSORS]; // Out
NvU32 resident_on_count; // Out
// If the memory is resident on the CPU, the NUMA node on which the page
// is resident. Otherwise, -1.
NvS32 resident_nid; // Out
// The size of the physical allocation backing lookup_address. Only the
// system-page-sized portion of this allocation which contains
// lookup_address is guaranteed to be resident on the corresponding
@@ -1168,19 +1190,6 @@ typedef struct
NV_STATUS rmStatus; // Out
} UVM_TEST_PMM_QUERY_PMA_STATS_PARAMS;
#define UVM_TEST_NUMA_GET_CLOSEST_CPU_NODE_TO_GPU UVM_TEST_IOCTL_BASE(77)
typedef struct
{
NvProcessorUuid gpu_uuid; // In
NvHandle client; // In
NvHandle smc_part_ref; // In
// On kernels with NUMA support, this entry contains the closest CPU NUMA
// node to this GPU. Otherwise, the value will be -1.
NvS32 node_id; // Out
NV_STATUS rmStatus; // Out
} UVM_TEST_NUMA_GET_CLOSEST_CPU_NODE_TO_GPU_PARAMS;
// Test whether the bottom halves have run on the correct CPUs based on the
// NUMA node locality of the GPU.
//

File diff suppressed because it is too large

View File

@@ -44,6 +44,7 @@
#include <linux/mmu_notifier.h>
#include <linux/wait.h>
#include <linux/nodemask.h>
// VA blocks are the leaf nodes in the uvm_va_space tree for managed allocations
// (VA ranges with type == UVM_VA_RANGE_TYPE_MANAGED):
@@ -229,6 +230,42 @@ typedef struct
} uvm_va_block_gpu_state_t;
typedef struct
{
// Per-page residency bit vector, used for fast traversal of resident
// pages.
//
// A set bit means the CPU has a coherent copy of the physical page
// resident in the NUMA node's memory, and that a CPU chunk for the
// corresponding page index has been allocated. This does not mean that
// the coherent copy is currently mapped anywhere, however. A page may be
// resident on multiple processors (but not multiple CPU NUMA nodes) when in
// read-duplicate mode.
//
// A cleared bit means the CPU NUMA node does not have a coherent copy of
// that page resident. A CPU chunk for the corresponding page index may or
// may not have been allocated. If the chunk is present, it's a cached chunk
// which can be reused in the future.
//
// Allocating PAGES_PER_UVM_VA_BLOCK is overkill when the block is
// smaller than UVM_VA_BLOCK_SIZE, but it's not much extra memory
// overhead on the whole.
uvm_page_mask_t resident;
// Per-page allocation bit vector.
//
// A set bit means that a CPU chunk has been allocated for the
// corresponding page index on this NUMA node.
uvm_page_mask_t allocated;
// CPU memory chunks represent physically contiguous CPU memory
// allocations. See uvm_pmm_sysmem.h for more details on CPU chunks.
// This member is meant to hold an opaque value indicating the CPU
// chunk storage method. For more details on CPU chunk storage,
// see uvm_cpu_chunk_storage_type_t in uvm_va_block.c.
unsigned long chunks;
} uvm_va_block_cpu_node_state_t;
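// Editorial illustration, not part of this commit: a sketch of how per-node
// CPU state could be looked up for a block. The cpu.node_state member added
// further below and a node_to_index() helper like the one referenced elsewhere
// in this change are assumed; this is not the driver's accessor.
static uvm_va_block_cpu_node_state_t *example_node_state_get(uvm_va_block_t *block, int nid)
{
    // One entry exists per possible NUMA node; NUMA_NO_NODE has no entry.
    UVM_ASSERT(nid != NUMA_NO_NODE);
    return block->cpu.node_state[node_to_index(nid)];
}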
// TODO: Bug 1766180: Worst-case we could have one of these per system page.
// Options:
// 1) Rely on the OOM killer to prevent the user from trying to do that
@@ -306,38 +343,30 @@ struct uvm_va_block_struct
struct
{
// Per-page residency bit vector, used for fast traversal of resident
// pages.
//
// A set bit means the CPU has a coherent copy of the physical page
// resident in its memory, and that the corresponding entry in the pages
// array is present. This does not mean that the coherent copy is
// currently mapped anywhere, however. A page may be resident on
// multiple processors when in read-duplicate mode.
//
// A cleared bit means the CPU does not have a coherent copy of that
// page resident. The corresponding entry in the pages array may or may
// not present. If the entry is present, it's a cached page which can be
// reused in the future.
//
// Allocating PAGES_PER_UVM_VA_BLOCK is overkill when the block is
// smaller than UVM_VA_BLOCK_SIZE, but it's not much extra memory
// overhead on the whole.
uvm_page_mask_t resident;
// CPU memory chunks represent physically contiguous CPU memory
// allocations. See uvm_pmm_sysmem.h for more details on CPU chunks.
// This member is meant to hold an opaque value indicating the CPU
// chunk storage method. For more details on CPU chunk storage,
// see uvm_cpu_chunk_storage_type_t in uvm_va_block.c.
unsigned long chunks;
// Per-NUMA node tracking of CPU allocations.
// This is a dense array with one entry per possible NUMA node.
uvm_va_block_cpu_node_state_t **node_state;
// Per-page allocation bit vector.
//
// A set bit means that a CPU page has been allocated for the
// corresponding page index.
// corresponding page index on at least one CPU NUMA node.
uvm_page_mask_t allocated;
// Per-page residency bit vector. See
// uvm_va_block_cpu_numa_state_t::resident for a detailed description.
// This mask is a cumulative mask (logical OR) of all
// uvm_va_block_cpu_node_state_t::resident masks. It is meant to be used
// only for fast testing of page residency when it matters only if the
// page is resident on the CPU.
//
// Note that this mask cannot be set directly as this will cause
// inconsistencies between this mask and the per-NUMA residency masks.
// In order to properly maintain consistency between the per-NUMA masks
// and this one, uvm_va_block_cpu_[set|clear]_residency_*() helpers
// should be used.
uvm_page_mask_t resident;
// Per-page mapping bit vectors, one per bit we need to track. These are
// used for fast traversal of valid mappings in the block. These contain
// all non-address bits needed to establish a virtual mapping on this
@@ -418,7 +447,8 @@ struct uvm_va_block_struct
uvm_page_mask_t read_duplicated_pages;
// Mask to keep track of the pages that are not mapped on any non-UVM-Lite
// processor.
// processor. This mask is not used for HMM because the CPU can map pages
// at any time without notifying the driver.
// 0: Page is definitely not mapped by any processors
// 1: Page may or may not be mapped by a processor
//
@@ -525,6 +555,13 @@ struct uvm_va_block_wrapper_struct
// a successful migration if this error flag is cleared.
NvU32 inject_cpu_pages_allocation_error_count;
// The NUMA node ID from which any CPU chunks will be allocated.
// This will override any other setting and/or policy.
// Note that the kernel is still free to allocate from any of the
// nodes in the thread's policy.
int cpu_chunk_allocation_target_id;
int cpu_chunk_allocation_actual_id;
// Force the next eviction attempt on this block to fail. Used for
// testing only.
bool inject_eviction_error;
@@ -668,17 +705,12 @@ void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context);
// Initialization of an already-allocated uvm_va_block_context_t.
//
// mm is used to initialize the value of va_block_context->mm. NULL is allowed.
static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm)
{
UVM_ASSERT(va_block_context);
void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm);
// Write garbage into the VA Block context to ensure that the UVM code
// clears masks appropriately
if (UVM_IS_DEBUG())
memset(va_block_context, 0xff, sizeof(*va_block_context));
va_block_context->mm = mm;
}
// Return the preferred NUMA node ID for the block's policy.
// If the preferred node ID is NUMA_NO_NODE, the current NUMA node ID
// is returned.
int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context);
// TODO: Bug 1766480: Using only page masks instead of a combination of regions
// and page masks could simplify the below APIs and their implementations
@@ -734,6 +766,9 @@ static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context,
// those masks. It is the caller's responsibility to zero the masks or
// not first.
//
// va_block_context->make_resident.dest_nid is used to guide the NUMA node for
// CPU allocations.
//
// Notably any status other than NV_OK indicates that the block's lock might
// have been unlocked and relocked.
//
@@ -1377,8 +1412,14 @@ static uvm_va_block_test_t *uvm_va_block_get_test(uvm_va_block_t *va_block)
// Get the page residency mask for a processor if it's known to be there.
//
// If the processor is the CPU, the residency mask for the NUMA node ID
// specified by nid will be returned (see
// uvm_va_block_cpu_node_state_t::resident). If nid is NUMA_NO_NODE,
// the cumulative CPU residency mask will be returned (see
// uvm_va_block_t::cpu::resident).
//
// If the processor is a GPU, this will assert that GPU state is indeed present.
uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor);
uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor, int nid);
// Get the page mapped mask for a processor. The returned mask cannot be
// directly modified by the caller
@@ -1386,6 +1427,13 @@ uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_proce
// If the processor is a GPU, this will assert that GPU state is indeed present.
const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor);
// Return a mask of non-UVM-Lite pages that are unmapped within the given
// region.
// Locking: The block lock must be held.
void uvm_va_block_unmapped_pages_get(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
uvm_page_mask_t *out_mask);
// VA block lookup functions. There are a number of permutations which might be
// useful, such as looking up the block from {va_space, va_range} x {addr,
// block index}. The ones implemented here and in uvm_va_range.h support the
@@ -1756,17 +1804,28 @@ static bool uvm_page_mask_full(const uvm_page_mask_t *mask)
return bitmap_full(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
static bool uvm_page_mask_and(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
static void uvm_page_mask_fill(uvm_page_mask_t *mask)
{
bitmap_fill(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
static bool uvm_page_mask_and(uvm_page_mask_t *mask_out,
const uvm_page_mask_t *mask_in1,
const uvm_page_mask_t *mask_in2)
{
return bitmap_and(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
static bool uvm_page_mask_andnot(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
static bool uvm_page_mask_andnot(uvm_page_mask_t *mask_out,
const uvm_page_mask_t *mask_in1,
const uvm_page_mask_t *mask_in2)
{
return bitmap_andnot(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
static void uvm_page_mask_or(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
static void uvm_page_mask_or(uvm_page_mask_t *mask_out,
const uvm_page_mask_t *mask_in1,
const uvm_page_mask_t *mask_in2)
{
bitmap_or(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
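// Editorial illustration, not part of this commit: a small sketch of how the
// mask helpers above compose. The function and mask names are hypothetical.
static bool example_resident_but_unmapped(uvm_page_mask_t *out,
                                          const uvm_page_mask_t *resident,
                                          const uvm_page_mask_t *mapped)
{
    // bitmap_andnot() semantics: returns true when the result is non-empty,
    // i.e. at least one resident page has no mapping recorded.
    return uvm_page_mask_andnot(out, resident, mapped);
}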
@@ -2036,30 +2095,49 @@ uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_blo
uvm_page_index_t page_index,
uvm_processor_id_t processor);
// Mark CPU page page_index as resident on NUMA node specified by nid.
// nid cannot be NUMA_NO_NODE.
void uvm_va_block_cpu_set_resident_page(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index);
// Test if a CPU page is resident on NUMA node nid. If nid is NUMA_NO_NODE,
// the function will return True if the page is resident on any CPU NUMA node.
bool uvm_va_block_cpu_is_page_resident_on(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index);
// Test if all pages in region are resident on NUMA node nid. If nid is
// NUMA_NO_NODE, the function will test if the pages in the region are
// resident on any CPU NUMA node.
bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, uvm_va_block_region_t region);
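// Editorial illustration, not part of this commit: usage sketch for the
// residency queries declared above. NUMA_NO_NODE checks the cumulative CPU
// residency mask, while a specific nid checks only that node's mask.
static bool example_page_resident_on_any_cpu_node(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    return uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index);
}

static bool example_page_resident_on_cpu_node(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
{
    UVM_ASSERT(nid != NUMA_NO_NODE);
    return uvm_va_block_cpu_is_page_resident_on(va_block, nid, page_index);
}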
// Insert a CPU chunk at the given page_index into the va_block.
// Locking: The va_block lock must be held.
NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_page_index_t page_index);
NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
// Remove a CPU chunk at the given page_index from the va_block.
// nid cannot be NUMA_NO_NODE.
// Locking: The va_block lock must be held.
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
uvm_page_index_t page_index);
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index);
// Return the CPU chunk at the given page_index from the va_block.
// Return the CPU chunk at the given page_index on the given NUMA node from the
// va_block. nid cannot be NUMA_NO_NODE.
// Locking: The va_block lock must be held.
uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block,
int nid,
uvm_page_index_t page_index);
// Return the CPU chunk at the given page_index from the va_block.
// Return the struct page * from the chunk corresponding to the given page_index
// Locking: The va_block lock must be held.
struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block,
uvm_page_index_t page_index);
struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
// Return the struct page * of the resident chunk at the given page_index from
// the va_block. The given page_index must be resident on the CPU.
// Locking: The va_block lock must be held.
struct page *uvm_va_block_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index);
// Physically map a CPU chunk so it is DMA'able from all registered GPUs.
// nid cannot be NUMA_NO_NODE.
// Locking: The va_block lock must be held.
NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_page_index_t page_index);
// Physically unmap a CPU chunk from all registered GPUs.

View File

@@ -30,6 +30,7 @@
#include "uvm_forward_decl.h"
#include <linux/migrate.h>
#include <linux/nodemask.h>
// UVM_VA_BLOCK_BITS is 21, meaning the maximum block size is 2MB. Rationale:
// - 2MB matches the largest Pascal GPU page size so it's a natural fit
@@ -145,6 +146,18 @@ typedef struct
unsigned count;
} uvm_prot_page_mask_array_t[UVM_PROT_MAX - 1];
typedef struct
{
// A per-NUMA-node array of page masks (size num_possible_nodes()) that hold
// the set of CPU pages used by the migration operation.
uvm_page_mask_t **node_masks;
// Node mask used to iterate over the page masks above.
// If a node's bit is set, it means that the page mask at index
// node_to_index() in node_masks has pages set.
nodemask_t nodes;
} uvm_make_resident_page_tracking_t;
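// Editorial illustration, not part of this commit: a sketch of walking the
// tracking structure above. node_to_index() is the helper its comment refers
// to and is assumed here; for_each_node_mask() is the kernel's nodemask
// iterator.
static void example_visit_tracked_cpu_pages(uvm_make_resident_page_tracking_t *tracking)
{
    int nid;

    for_each_node_mask(nid, tracking->nodes) {
        uvm_page_mask_t *pages = tracking->node_masks[node_to_index(nid)];

        // 'pages' is the set of CPU pages this operation used on NUMA node nid.
        (void)pages;
    }
}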
// In the worst case some VA block operations require more state than we should
// reasonably store on the stack. Instead, we dynamically allocate VA block
// contexts. These are used for almost all operations on VA blocks.
@@ -159,6 +172,9 @@ typedef struct
// this block_context.
uvm_page_mask_t scratch_page_mask;
// Scratch node mask. This follows the same rules as scratch_page_mask.
nodemask_t scratch_node_mask;
// State used by uvm_va_block_make_resident
struct uvm_make_resident_context_struct
{
@@ -181,10 +197,24 @@ typedef struct
// Used to perform ECC checks after the migration is done.
uvm_processor_mask_t all_involved_processors;
// Page mask used to compute the set of CPU pages for each CPU node.
uvm_page_mask_t node_pages_mask;
// Final residency for the data. This is useful for callees to know if
// a migration is part of a staging copy
uvm_processor_id_t dest_id;
// Final residency NUMA node if the migration destination is the CPU.
int dest_nid;
// This structure is used to track CPU pages used for migrations on
// a per-NUMA node basis.
//
// The pages could be used for either migrations to the CPU (used to
// track the destination CPU pages) or staging copies (used to track
// the CPU pages used for the staging).
uvm_make_resident_page_tracking_t cpu_pages_used;
// Event that triggered the call
uvm_make_resident_cause_t cause;
} make_resident;

View File

@@ -31,6 +31,7 @@
const uvm_va_policy_t uvm_va_policy_default = {
.preferred_location = UVM_ID_INVALID,
.preferred_nid = NUMA_NO_NODE,
.read_duplication = UVM_READ_DUPLICATION_UNSET,
};

View File

@@ -24,6 +24,7 @@
#ifndef __UVM_VA_POLICY_H__
#define __UVM_VA_POLICY_H__
#include <linux/numa.h>
#include "uvm_linux.h"
#include "uvm_forward_decl.h"
#include "uvm_processors.h"
@@ -62,6 +63,18 @@ struct uvm_va_policy_struct
// This is set to UVM_ID_INVALID if no preferred location is set.
uvm_processor_id_t preferred_location;
// If the preferred location is the CPU, this is either the preferred NUMA
// node ID or NUMA_NO_NODE to indicate that there is no preference among
// nodes.
// If preferred_location is a GPU, preferred_nid will be used if CPU
// pages have to be allocated for any staging copies. Otherwise, it is
// not used.
//
// TODO: Bug 4148100 - Preferred_location and preferred_nid should be
// combined into a new type that combines the processor and NUMA node
// ID.
int preferred_nid;
// Mask of processors that are accessing this VA range and should have
// their page tables updated to access the (possibly remote) pages.
uvm_processor_mask_t accessed_by;
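// Editorial illustration, not part of this commit: a minimal sketch of how a
// CPU allocation path might interpret preferred_location and preferred_nid
// together. The function name is hypothetical and this is not the driver's
// logic.
static int example_preferred_cpu_node(const uvm_va_policy_t *policy)
{
    // No preferred location means no node preference either.
    if (UVM_ID_IS_INVALID(policy->preferred_location))
        return NUMA_NO_NODE;

    // preferred_nid applies both when the CPU is the preferred location and
    // when CPU pages are needed for staging copies to a preferred GPU;
    // NUMA_NO_NODE means no preference among CPU nodes.
    return policy->preferred_nid;
}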

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -162,9 +162,7 @@ static uvm_va_range_t *uvm_va_range_alloc_managed(uvm_va_space_t *va_space, NvU6
goto error;
va_range->type = UVM_VA_RANGE_TYPE_MANAGED;
uvm_va_range_get_policy(va_range)->read_duplication = UVM_READ_DUPLICATION_UNSET;
uvm_va_range_get_policy(va_range)->preferred_location = UVM_ID_INVALID;
va_range->managed.policy = uvm_va_policy_default;
va_range->blocks = uvm_kvmalloc_zero(uvm_va_range_num_blocks(va_range) * sizeof(va_range->blocks[0]));
if (!va_range->blocks) {
@@ -376,7 +374,7 @@ NV_STATUS uvm_va_range_create_semaphore_pool(uvm_va_space_t *va_space,
if (status != NV_OK)
goto error;
if (i == 0 && g_uvm_global.conf_computing_enabled)
if (i == 0 && g_uvm_global.sev_enabled)
mem_alloc_params.dma_owner = gpu;
if (attrs.is_cacheable) {
@@ -835,7 +833,7 @@ static void uvm_va_range_disable_peer_external(uvm_va_range_t *va_range,
range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
uvm_mutex_lock(&range_tree->lock);
uvm_ext_gpu_map_for_each_safe(ext_map, ext_map_next, va_range, mapping_gpu) {
if (ext_map->owning_gpu == owning_gpu && !ext_map->is_sysmem) {
if (ext_map->owning_gpu == owning_gpu && (!ext_map->is_sysmem || ext_map->is_egm)) {
UVM_ASSERT(deferred_free_list);
uvm_ext_gpu_map_destroy(va_range, ext_map, deferred_free_list);
}
@@ -1807,7 +1805,7 @@ NV_STATUS uvm_api_alloc_semaphore_pool(UVM_ALLOC_SEMAPHORE_POOL_PARAMS *params,
if (params->gpuAttributesCount > UVM_MAX_GPUS)
return NV_ERR_INVALID_ARGUMENT;
if (g_uvm_global.conf_computing_enabled && params->gpuAttributesCount == 0)
if (g_uvm_global.sev_enabled && params->gpuAttributesCount == 0)
return NV_ERR_INVALID_ARGUMENT;
// The mm needs to be locked in order to remove stale HMM va_blocks.

View File

@@ -189,6 +189,7 @@ typedef struct
// sysmem was originally allocated under. For the allocation to remain valid
// we need to prevent the GPU from going away, similarly to P2P mapped
// memory.
// Similarly for EGM memory.
//
// This field is not used for sparse mappings as they don't have an
// allocation and, hence, owning GPU.
@@ -208,6 +209,9 @@ typedef struct
// backing.
bool is_sysmem;
// EGM memory. If true, is_sysmem also has to be true and owning_gpu
// has to be valid.
bool is_egm;
// GPU page tables mapping the allocation
uvm_page_table_range_vec_t pt_range_vec;

View File

@@ -222,6 +222,12 @@ NV_STATUS uvm_va_space_create(struct address_space *mapping, uvm_va_space_t **va
uvm_down_write_mmap_lock(current->mm);
uvm_va_space_down_write(va_space);
va_space->va_block_context = uvm_va_block_context_alloc(NULL);
if (!va_space->va_block_context) {
status = NV_ERR_NO_MEMORY;
goto fail;
}
status = uvm_perf_init_va_space_events(va_space, &va_space->perf_events);
if (status != NV_OK)
goto fail;
@@ -258,6 +264,7 @@ NV_STATUS uvm_va_space_create(struct address_space *mapping, uvm_va_space_t **va
fail:
uvm_perf_heuristics_unload(va_space);
uvm_perf_destroy_va_space_events(&va_space->perf_events);
uvm_va_block_context_free(va_space->va_block_context);
uvm_va_space_up_write(va_space);
uvm_up_write_mmap_lock(current->mm);
@@ -457,8 +464,6 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space)
uvm_va_range_destroy(va_range, &deferred_free_list);
}
uvm_hmm_va_space_destroy(va_space);
uvm_range_group_radix_tree_destroy(va_space);
// Unregister all GPUs in the VA space. Note that this does not release the
@@ -466,11 +471,17 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space)
for_each_va_space_gpu(gpu, va_space)
unregister_gpu(va_space, gpu, NULL, &deferred_free_list, NULL);
uvm_hmm_va_space_destroy(va_space);
uvm_perf_heuristics_unload(va_space);
uvm_perf_destroy_va_space_events(&va_space->perf_events);
va_space_remove_dummy_thread_contexts(va_space);
// Destroy the VA space's block context node tracking after all ranges have
// been destroyed as the VA blocks may reference it.
uvm_va_block_context_free(va_space->va_block_context);
uvm_va_space_up_write(va_space);
UVM_ASSERT(uvm_processor_mask_empty(&va_space->registered_gpus));
@@ -688,7 +699,7 @@ NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
// Mixing coherent and non-coherent GPUs is not supported
for_each_va_space_gpu(other_gpu, va_space) {
if (uvm_gpu_is_coherent(gpu->parent) != uvm_gpu_is_coherent(other_gpu->parent)) {
if (uvm_parent_gpu_is_coherent(gpu->parent) != uvm_parent_gpu_is_coherent(other_gpu->parent)) {
status = NV_ERR_INVALID_DEVICE;
goto done;
}
@@ -729,7 +740,7 @@ NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
processor_mask_array_set(va_space->has_nvlink, UVM_ID_CPU, gpu->id);
}
if (uvm_gpu_is_coherent(gpu->parent)) {
if (uvm_parent_gpu_is_coherent(gpu->parent)) {
processor_mask_array_set(va_space->has_native_atomics, gpu->id, UVM_ID_CPU);
if (gpu->mem_info.numa.enabled) {
@@ -1540,7 +1551,6 @@ static void remove_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space,
atomic_inc(&va_space->gpu_va_space_deferred_free.num_pending);
uvm_processor_mask_clear(&va_space->registered_gpu_va_spaces, gpu_va_space->gpu->id);
uvm_processor_mask_clear_atomic(&va_space->needs_fault_buffer_flush, gpu_va_space->gpu->id);
va_space->gpu_va_spaces[uvm_id_gpu_index(gpu_va_space->gpu->id)] = NULL;
gpu_va_space->state = UVM_GPU_VA_SPACE_STATE_DEAD;
}
@@ -1610,14 +1620,14 @@ NV_STATUS uvm_va_space_unregister_gpu_va_space(uvm_va_space_t *va_space, const N
return status;
}
bool uvm_va_space_peer_enabled(uvm_va_space_t *va_space, uvm_gpu_t *gpu1, uvm_gpu_t *gpu2)
bool uvm_va_space_peer_enabled(uvm_va_space_t *va_space, const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
size_t table_index;
UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu0->id));
UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu1->id));
UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu2->id));
table_index = uvm_gpu_peer_table_index(gpu1->id, gpu2->id);
table_index = uvm_gpu_peer_table_index(gpu0->id, gpu1->id);
return !!test_bit(table_index, va_space->enabled_peers);
}
@@ -2073,9 +2083,16 @@ NV_STATUS uvm_service_block_context_init(void)
// Pre-allocate some fault service contexts for the CPU and add them to the global list
while (num_preallocated_contexts-- > 0) {
uvm_service_block_context_t *service_context = uvm_kvmalloc(sizeof(*service_context));
if (!service_context)
return NV_ERR_NO_MEMORY;
service_context->block_context = uvm_va_block_context_alloc(NULL);
if (!service_context->block_context) {
uvm_kvfree(service_context);
return NV_ERR_NO_MEMORY;
}
list_add(&service_context->cpu_fault.service_context_list, &g_cpu_service_block_context_list);
}
@@ -2089,6 +2106,7 @@ void uvm_service_block_context_exit(void)
// Free fault service contexts for the CPU and add clear the global list
list_for_each_entry_safe(service_context, service_context_tmp, &g_cpu_service_block_context_list,
cpu_fault.service_context_list) {
uvm_va_block_context_free(service_context->block_context);
uvm_kvfree(service_context);
}
INIT_LIST_HEAD(&g_cpu_service_block_context_list);
@@ -2110,8 +2128,17 @@ static uvm_service_block_context_t *service_block_context_cpu_alloc(void)
uvm_spin_unlock(&g_cpu_service_block_context_list_lock);
if (!service_context)
if (!service_context) {
service_context = uvm_kvmalloc(sizeof(*service_context));
if (!service_context)
    return NULL;
service_context->block_context = uvm_va_block_context_alloc(NULL);
if (!service_context->block_context) {
uvm_kvfree(service_context);
service_context = NULL;
}
}
else {
uvm_va_block_context_init(service_context->block_context, NULL);
}
return service_context;
}
@@ -2137,6 +2164,7 @@ static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
NV_STATUS status = uvm_global_get_status();
bool tools_enabled;
bool major_fault = false;
bool is_remote_mm = false;
uvm_service_block_context_t *service_context;
uvm_global_processor_mask_t gpus_to_check_for_ecc;
@@ -2177,7 +2205,7 @@ static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
// mmap_lock held on the CPU fault path, so tell the fault handler to use
// that one. current->mm might differ if we're on the access_process_vm
// (ptrace) path or if another driver is calling get_user_pages.
service_context->block_context.mm = vma->vm_mm;
service_context->block_context->mm = vma->vm_mm;
// The mmap_lock might be held in write mode, but the mode doesn't matter
// for the purpose of lock ordering and we don't rely on it being in write
@@ -2216,25 +2244,32 @@ static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
uvm_tools_record_throttling_end(va_space, fault_addr, UVM_ID_CPU);
if (is_hmm) {
// Note that normally we should find a va_block for the faulting
// address because the block had to be created when migrating a
// page to the GPU and a device private PTE inserted into the CPU
// page tables in order for migrate_to_ram() to be called. Not
// finding it means the PTE was remapped to a different virtual
// address with mremap() so create a new va_block if needed.
status = uvm_hmm_va_block_find_create(va_space,
fault_addr,
&service_context->block_context.hmm.vma,
&va_block);
if (status != NV_OK)
break;
if (va_space->va_space_mm.mm == vma->vm_mm) {
// Note that normally we should find a va_block for the faulting
// address because the block had to be created when migrating a
// page to the GPU and a device private PTE inserted into the CPU
// page tables in order for migrate_to_ram() to be called. Not
// finding it means the PTE was remapped to a different virtual
// address with mremap() so create a new va_block if needed.
status = uvm_hmm_va_block_find_create(va_space,
fault_addr,
&service_context->block_context->hmm.vma,
&va_block);
if (status != NV_OK)
break;
UVM_ASSERT(service_context->block_context.hmm.vma == vma);
status = uvm_hmm_migrate_begin(va_block);
if (status != NV_OK)
break;
UVM_ASSERT(service_context->block_context->hmm.vma == vma);
status = uvm_hmm_migrate_begin(va_block);
if (status != NV_OK)
break;
service_context->cpu_fault.vmf = vmf;
service_context->cpu_fault.vmf = vmf;
}
else {
is_remote_mm = true;
status = uvm_hmm_remote_cpu_fault(vmf);
break;
}
}
else {
status = uvm_va_block_find_create_managed(va_space, fault_addr, &va_block);
@@ -2265,7 +2300,7 @@ static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
tools_enabled = va_space->tools.enabled;
if (status == NV_OK) {
if (status == NV_OK && !is_remote_mm) {
uvm_va_space_global_gpus_in_mask(va_space,
&gpus_to_check_for_ecc,
&service_context->cpu_fault.gpus_to_check_for_ecc);
@@ -2275,7 +2310,7 @@ static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
uvm_va_space_up_read(va_space);
uvm_record_unlock_mmap_lock_read(vma->vm_mm);
if (status == NV_OK) {
if (status == NV_OK && !is_remote_mm) {
status = uvm_global_mask_check_ecc_error(&gpus_to_check_for_ecc);
uvm_global_mask_release(&gpus_to_check_for_ecc);
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -253,17 +253,6 @@ struct uvm_va_space_struct
// corrupting state.
uvm_processor_mask_t gpu_unregister_in_progress;
// On VMA destruction, the fault buffer needs to be flushed for all the GPUs
// registered in the VA space to avoid leaving stale entries of the VA range
// that is going to be destroyed. Otherwise, these fault entries can be
// attributed to new VA ranges reallocated at the same addresses. However,
// uvm_vm_close is called with mm->mmap_lock taken and we cannot take the
// ISR lock. Therefore, we use a flag to notify the GPU fault handler that
// the fault buffer needs to be flushed, before servicing the faults that
// belong to the va_space. The bits are set and cleared atomically so no
// va_space lock is required.
uvm_processor_mask_t needs_fault_buffer_flush;
// Mask of processors that are participating in system-wide atomics
uvm_processor_mask_t system_wide_atomics_enabled_processors;
@@ -335,7 +324,7 @@ struct uvm_va_space_struct
// Block context used for GPU unmap operations so that allocation is not
// required on the teardown path. This can only be used while the VA space
// lock is held in write mode. Access using uvm_va_space_block_context().
uvm_va_block_context_t va_block_context;
uvm_va_block_context_t *va_block_context;
NvU64 initialization_flags;
@@ -541,7 +530,7 @@ void uvm_va_space_detach_all_user_channels(uvm_va_space_t *va_space, struct list
// Returns whether peer access between these two GPUs has been enabled in this
// VA space. Both GPUs must be registered in the VA space.
bool uvm_va_space_peer_enabled(uvm_va_space_t *va_space, uvm_gpu_t *gpu1, uvm_gpu_t *gpu2);
bool uvm_va_space_peer_enabled(uvm_va_space_t *va_space, const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1);
// Returns the va_space this file points to. Returns NULL if this file
// does not point to a va_space.
@@ -575,8 +564,8 @@ static uvm_va_block_context_t *uvm_va_space_block_context(uvm_va_space_t *va_spa
if (mm)
uvm_assert_mmap_lock_locked(mm);
uvm_va_block_context_init(&va_space->va_block_context, mm);
return &va_space->va_block_context;
uvm_va_block_context_init(va_space->va_block_context, mm);
return va_space->va_block_context;
}
// Retains the GPU VA space memory object. destroy_gpu_va_space and

View File

@@ -215,7 +215,13 @@ bool uvm_va_space_mm_enabled(uvm_va_space_t *va_space)
static struct mmu_notifier_ops uvm_mmu_notifier_ops_ats =
{
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE)
.invalidate_range = uvm_mmu_notifier_invalidate_range_ats,
#elif defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
.arch_invalidate_secondary_tlbs = uvm_mmu_notifier_invalidate_range_ats,
#else
#error One of invalidate_range/arch_invalidate_secondary_tlbs must be present
#endif
};
static int uvm_mmu_notifier_register(uvm_va_space_mm_t *va_space_mm)
@@ -310,17 +316,6 @@ void uvm_va_space_mm_unregister(uvm_va_space_t *va_space)
if (!va_space_mm->mm)
return;
// At this point the mm is still valid because uvm_mm_release()
// hasn't yet called mmput(). uvm_hmm_va_space_destroy() will kill
// all the va_blocks along with any associated gpu_chunks, so we
// need to make sure these chunks are free. However freeing them
// requires a valid mm so we can call migrate_vma_setup(), so we
// do that here.
// TODO: Bug 3902536: [UVM-HMM] add code to migrate GPU memory
// without having a va_block
if (uvm_hmm_is_enabled(va_space))
uvm_hmm_evict_va_blocks(va_space);
if (uvm_va_space_mm_enabled(va_space)) {
if (UVM_ATS_IBM_SUPPORTED_IN_DRIVER() && g_uvm_global.ats.enabled)
uvm_mmu_notifier_unregister(va_space_mm);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2023 NVIDIA Corporation
Copyright (c) 2017-2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -145,10 +145,7 @@ static NvU64 small_half_pde_volta(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_volta(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
static void make_pde_volta(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
{
NvU32 entry_count = entries_per_index_volta(depth);
NvU64 *entry_bits = (NvU64 *)entry;