535.113.01

2026-03-13 09:09:53 +00:00 · 2023-09-21 10:43:43 -07:00
parent a8e01be6b2
commit f59818b751
94 changed files with 2414 additions and 800 deletions
--- a/kernel-open/Kbuild
+++ b/kernel-open/Kbuild
@@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
 EXTRA_CFLAGS += -I$(src)
 EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
 EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
-EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.104.05\"
+EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.113.01\"

 ifneq ($(SYSSRCHOST1X),)
 EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
--- a/kernel-open/common/inc/os-interface.h
+++ b/kernel-open/common/inc/os-interface.h
@@ -207,9 +207,13 @@ enum os_pci_req_atomics_type {
    OS_INTF_PCIE_REQ_ATOMICS_128BIT
 };
 NV_STATUS   NV_API_CALL  os_enable_pci_req_atomics   (void *, enum os_pci_req_atomics_type);
+NV_STATUS   NV_API_CALL  os_get_numa_node_memory_usage (NvS32, NvU64 *, NvU64 *);
 NV_STATUS   NV_API_CALL  os_numa_add_gpu_memory      (void *, NvU64, NvU64, NvU32 *);
 NV_STATUS   NV_API_CALL  os_numa_remove_gpu_memory   (void *, NvU64, NvU64, NvU32); 
 NV_STATUS   NV_API_CALL  os_offline_page_at_address(NvU64 address);
+void*       NV_API_CALL  os_get_pid_info(void);
+void        NV_API_CALL  os_put_pid_info(void *pid_info);
+NV_STATUS   NV_API_CALL  os_find_ns_pid(void *pid_info, NvU32 *ns_pid);

 extern NvU32 os_page_size;
 extern NvU64 os_page_mask;
--- a/kernel-open/conftest.sh
+++ b/kernel-open/conftest.sh
@@ -5636,23 +5636,6 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_GPIO_TO_IRQ_PRESENT" "" "functions"
        ;;

-        migrate_vma_setup)
-            #
-            # Determine if migrate_vma_setup() function is present
-            #
-            # migrate_vma_setup() function was added by commit
-            # a7d1f22bb74f32cf3cd93f52776007e161f1a738 ("mm: turn migrate_vma
-            # upside down) in v5.4.
-            # (2019-08-20).
-            CODE="
-            #include <linux/migrate.h>
-            int conftest_migrate_vma_setup(void) {
-                migrate_vma_setup();
-            }"
-
-            compile_check_conftest "$CODE" "NV_MIGRATE_VMA_SETUP_PRESENT" "" "functions"
-        ;;
-
        migrate_vma_added_flags)
            #
            # Determine if migrate_vma structure has flags
@@ -5743,23 +5726,25 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_IOASID_GET_PRESENT" "" "functions"
        ;;

-        mm_pasid_set)
+        mm_pasid_drop)
            #
-            # Determine if mm_pasid_set() function is present
+            # Determine if mm_pasid_drop() function is present
+            #
+            # Added by commit 701fac40384f ("iommu/sva: Assign a PASID to mm
+            # on PASID allocation and free it on mm exit") in v5.18.
+            # Moved to linux/iommu.h in commit cd3891158a77 ("iommu/sva: Move
+            # PASID helpers to sva code") in v6.4.
            #
-            # mm_pasid_set() function was added by commit
-            # 701fac40384f07197b106136012804c3cae0b3de (iommu/sva: Assign a
-            # PASID to mm on PASID allocation and free it on mm exit) in v5.18.
-            # (2022-02-15).
            CODE="
            #if defined(NV_LINUX_SCHED_MM_H_PRESENT)
            #include <linux/sched/mm.h>
            #endif
-            void conftest_mm_pasid_set(void) {
-                mm_pasid_set();
+            #include <linux/iommu.h>
+            void conftest_mm_pasid_drop(void) {
+                mm_pasid_drop();
            }"

-            compile_check_conftest "$CODE" "NV_MM_PASID_SET_PRESENT" "" "functions"
+            compile_check_conftest "$CODE" "NV_MM_PASID_DROP_PRESENT" "" "functions"
        ;;

        drm_crtc_state_has_no_vblank)
@@ -6341,6 +6326,22 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_MEMPOLICY_HAS_HOME_NODE" "" "types"
        ;;

+        mpol_preferred_many_present)
+            #
+            # Determine if MPOL_PREFERRED_MANY enum is present or not
+            #
+            # Added by commit b27abaccf8e8b ("mm/mempolicy: add
+            # MPOL_PREFERRED_MANY for multiple preferred nodes") in
+            # v5.15
+            #
+            CODE="
+            #include <linux/mempolicy.h>
+            int mpol_preferred_many = MPOL_PREFERRED_MANY;
+            "
+
+            compile_check_conftest "$CODE" "NV_MPOL_PREFERRED_MANY_PRESENT" "" "types"
+        ;;
+
        mmu_interval_notifier)
            #
            # Determine if mmu_interval_notifier struct is present or not
--- a/kernel-open/nvidia-uvm/nvidia-uvm.Kbuild
+++ b/kernel-open/nvidia-uvm/nvidia-uvm.Kbuild
@@ -81,8 +81,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_memory_uc
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_pages_uc
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += ioasid_get
-NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_set
-NV_CONFTEST_FUNCTION_COMPILE_TESTS += migrate_vma_setup
+NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_drop
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmget_not_zero
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmgrab
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += iommu_sva_bind_device_has_drvdata_arg
@@ -110,6 +109,8 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += handle_mm_fault_has_mm_arg
 NV_CONFTEST_TYPE_COMPILE_TESTS += handle_mm_fault_has_pt_regs_arg
 NV_CONFTEST_TYPE_COMPILE_TESTS += mempolicy_has_unified_nodes
 NV_CONFTEST_TYPE_COMPILE_TESTS += mempolicy_has_home_node
+NV_CONFTEST_TYPE_COMPILE_TESTS += mpol_preferred_many_present
 NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_interval_notifier

 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_int_active_memcg
+NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_migrate_vma_setup
--- a/kernel-open/nvidia-uvm/uvm_ats_faults.c
+++ b/kernel-open/nvidia-uvm/uvm_ats_faults.c
@@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2018 NVIDIA Corporation
+    Copyright (c) 2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -149,7 +149,11 @@ static void ats_batch_select_residency(uvm_gpu_va_space_t *gpu_va_space,

    mode = vma_policy->mode;

-    if ((mode == MPOL_BIND) || (mode == MPOL_PREFERRED_MANY) || (mode == MPOL_PREFERRED)) {
+    if ((mode == MPOL_BIND)
+#if defined(NV_MPOL_PREFERRED_MANY_PRESENT)
+         || (mode == MPOL_PREFERRED_MANY)
+#endif
+         || (mode == MPOL_PREFERRED)) {
        int home_node = NUMA_NO_NODE;

 #if defined(NV_MEMPOLICY_HAS_HOME_NODE)
@@ -467,6 +471,10 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
            uvm_page_mask_and(write_fault_mask, write_fault_mask, read_fault_mask);
        else
            uvm_page_mask_zero(write_fault_mask);
+
+        // There are no pending faults beyond write faults to RO region.
+        if (uvm_page_mask_empty(read_fault_mask))
+            return status;
    }

    ats_batch_select_residency(gpu_va_space, vma, ats_context);
--- a/kernel-open/nvidia-uvm/uvm_ats_sva.h
+++ b/kernel-open/nvidia-uvm/uvm_ats_sva.h
@@ -32,19 +32,23 @@
 // For ATS support on aarch64, arm_smmu_sva_bind() is needed for
 // iommu_sva_bind_device() calls. Unfortunately, arm_smmu_sva_bind() is not
 // conftest-able. We instead look for the presence of ioasid_get() or
-// mm_pasid_set(). ioasid_get() was added in the same patch series as
-// arm_smmu_sva_bind() and removed in v6.0. mm_pasid_set() was added in the
+// mm_pasid_drop(). ioasid_get() was added in the same patch series as
+// arm_smmu_sva_bind() and removed in v6.0. mm_pasid_drop() was added in the
 // same patch as the removal of ioasid_get(). We assume the presence of
-// arm_smmu_sva_bind() if ioasid_get(v5.11 - v5.17) or mm_pasid_set(v5.18+) is
+// arm_smmu_sva_bind() if ioasid_get(v5.11 - v5.17) or mm_pasid_drop(v5.18+) is
 // present.
 //
 // arm_smmu_sva_bind() was added with commit
 // 32784a9562fb0518b12e9797ee2aec52214adf6f and ioasid_get() was added with
 // commit cb4789b0d19ff231ce9f73376a023341300aed96 (11/23/2020). Commit
 // 701fac40384f07197b106136012804c3cae0b3de (02/15/2022) removed ioasid_get()
-// and added mm_pasid_set().
-    #if UVM_CAN_USE_MMU_NOTIFIERS() && (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_SET_PRESENT))
-        #define UVM_ATS_SVA_SUPPORTED() 1
+// and added mm_pasid_drop().
+    #if UVM_CAN_USE_MMU_NOTIFIERS() && (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_DROP_PRESENT))
+        #if defined(CONFIG_IOMMU_SVA)
+            #define UVM_ATS_SVA_SUPPORTED() 1
+        #else
+            #define UVM_ATS_SVA_SUPPORTED() 0
+        #endif
    #else
        #define UVM_ATS_SVA_SUPPORTED() 0
    #endif
--- a/kernel-open/nvidia-uvm/uvm_ce_test.c
+++ b/kernel-open/nvidia-uvm/uvm_ce_test.c
@@ -191,7 +191,7 @@ static NV_STATUS test_membar(uvm_gpu_t *gpu)

    for (i = 0; i < REDUCTIONS; ++i) {
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
-        gpu->parent->ce_hal->semaphore_reduction_inc(&push, host_mem_gpu_va, REDUCTIONS + 1);
+        gpu->parent->ce_hal->semaphore_reduction_inc(&push, host_mem_gpu_va, REDUCTIONS);
    }

    // Without a sys membar the channel tracking semaphore can and does complete
@@ -577,7 +577,7 @@ static NV_STATUS test_semaphore_reduction_inc(uvm_gpu_t *gpu)

    for (i = 0; i < REDUCTIONS; i++) {
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
-        gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va, i+1);
+        gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va, REDUCTIONS);
    }

    status = uvm_push_end_and_wait(&push);
--- a/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c
@@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2015-2022 NVIDIA Corporation
+    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -697,9 +697,6 @@ static inline int cmp_access_type(uvm_fault_access_type_t a, uvm_fault_access_ty

 typedef enum
 {
-    // Fetch a batch of faults from the buffer.
-    FAULT_FETCH_MODE_BATCH_ALL,
-
    // Fetch a batch of faults from the buffer. Stop at the first entry that is
    // not ready yet
    FAULT_FETCH_MODE_BATCH_READY,
@@ -857,9 +854,7 @@ static NV_STATUS fetch_fault_buffer_entries(uvm_gpu_t *gpu,
        // written out of order
        UVM_SPIN_WHILE(!gpu->parent->fault_buffer_hal->entry_is_valid(gpu->parent, get), &spin) {
            // We have some entry to work on. Let's do the rest later.
-            if (fetch_mode != FAULT_FETCH_MODE_ALL &&
-                fetch_mode != FAULT_FETCH_MODE_BATCH_ALL &&
-                fault_index > 0)
+            if (fetch_mode == FAULT_FETCH_MODE_BATCH_READY && fault_index > 0)
                goto done;
        }

@@ -888,6 +883,7 @@ static NV_STATUS fetch_fault_buffer_entries(uvm_gpu_t *gpu,

        current_entry->va_space = NULL;
        current_entry->filtered = false;
+        current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;

        if (current_entry->fault_source.utlb_id > batch_context->max_utlb_id) {
            UVM_ASSERT(current_entry->fault_source.utlb_id < replayable_faults->utlb_count);
@@ -1378,7 +1374,10 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
        UVM_ASSERT(current_entry->fault_access_type ==
                   uvm_fault_access_type_mask_highest(current_entry->access_type_mask));

-        current_entry->is_fatal            = false;
+        // Unserviceable faults were already skipped by the caller. There are no
+        // unserviceable fault types that could be in the same VA block as a
+        // serviceable fault.
+        UVM_ASSERT(!current_entry->is_fatal);
        current_entry->is_throttled        = false;
        current_entry->is_invalid_prefetch = false;

@@ -1735,6 +1734,10 @@ static NV_STATUS service_fault_batch_ats_sub(uvm_gpu_va_space_t *gpu_va_space,
        uvm_fault_access_type_t access_type = current_entry->fault_access_type;
        bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);

+        // ATS faults can't be unserviceable, since unserviceable faults require
+        // GMMU PTEs.
+        UVM_ASSERT(!current_entry->is_fatal);
+
        i++;

        update_batch_and_notify_fault(gpu_va_space->gpu,
@@ -2044,8 +2047,10 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
            uvm_va_space_mm_release_unlock(va_space, mm);
            mm = NULL;
            va_space = NULL;
+            status = NV_OK;
            continue;
        }
+
        if (status != NV_OK)
            goto fail;

@@ -2273,27 +2278,9 @@ static NvU32 is_fatal_fault_in_buffer(uvm_fault_service_batch_context_t *batch_c
    return false;
 }

-typedef enum
-{
-    // Only cancel faults flagged as fatal
-    FAULT_CANCEL_MODE_FATAL,
-
-    // Cancel all faults in the batch unconditionally
-    FAULT_CANCEL_MODE_ALL,
-} fault_cancel_mode_t;
-
-// Cancel faults in the given fault service batch context. The function provides
-// two different modes depending on the value of cancel_mode:
-// - If cancel_mode == FAULT_CANCEL_MODE_FATAL, only faults flagged as fatal
-// will be cancelled. In this case, the reason reported to tools is the one
-// contained in the fault entry itself.
-// - If cancel_mode == FAULT_CANCEL_MODE_ALL, all faults will be cancelled
-// unconditionally. In this case, the reason reported to tools for non-fatal
-// faults is the one passed to this function.
-static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu,
-                                          uvm_fault_service_batch_context_t *batch_context,
-                                          fault_cancel_mode_t cancel_mode,
-                                          UvmEventFatalReason reason)
+// Cancel just the faults flagged as fatal in the given fault service batch
+// context.
+static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
 {
    NV_STATUS status = NV_OK;
    NV_STATUS fault_status;
@@ -2301,8 +2288,6 @@ static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu,
    NvU32 i;

    UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
-    if (cancel_mode == FAULT_CANCEL_MODE_ALL)
-        UVM_ASSERT(reason != UvmEventFatalReasonInvalid);

    for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
        uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
@@ -2320,12 +2305,66 @@ static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu,
            uvm_va_space_down_read(va_space);

            // We don't need to check whether a buffer flush is required
-            // (due to VA range destruction).
-            // - For cancel_mode == FAULT_CANCEL_MODE_FATAL, once a fault is
-            // flagged as fatal we need to cancel it, even if its VA range no
-            // longer exists.
-            // - For cancel_mode == FAULT_CANCEL_MODE_ALL we don't care about
-            // any of this, we just want to trigger RC in RM.
+            // (due to VA range destruction). Once a fault is flagged as fatal
+            // we need to cancel it, even if its VA range no longer exists.
+        }
+
+        // See the comment for the same check in cancel_faults_all
+        if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id))
+            continue;
+
+        if (current_entry->is_fatal) {
+            status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode);
+            if (status != NV_OK)
+                break;
+        }
+    }
+
+    if (va_space != NULL)
+        uvm_va_space_up_read(va_space);
+
+    // See the comment on flushing in cancel_faults_all
+    fault_status = fault_buffer_flush_locked(gpu,
+                                             UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
+                                             UVM_FAULT_REPLAY_TYPE_START,
+                                             batch_context);
+
+    // We report the first encountered error.
+    if (status == NV_OK)
+        status = fault_status;
+
+    return status;
+}
+
+// Cancel all faults in the given fault service batch context, even those not
+// marked as fatal.
+static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
+                                   uvm_fault_service_batch_context_t *batch_context,
+                                   UvmEventFatalReason reason)
+{
+    NV_STATUS status = NV_OK;
+    NV_STATUS fault_status;
+    uvm_va_space_t *va_space = NULL;
+    NvU32 i;
+
+    UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
+    UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
+
+    for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
+        uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
+        uvm_fault_cancel_va_mode_t cancel_va_mode;
+
+        UVM_ASSERT(current_entry->va_space);
+
+        if (current_entry->va_space != va_space) {
+            // Fault on a different va_space, drop the lock of the old one...
+            if (va_space != NULL)
+                uvm_va_space_up_read(va_space);
+
+            va_space = current_entry->va_space;
+
+            // ... and take the lock of the new one
+            uvm_va_space_down_read(va_space);
        }

        if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
@@ -2337,32 +2376,28 @@ static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu,
            continue;
        }

-        // Cancel the fault
-        if (cancel_mode == FAULT_CANCEL_MODE_ALL || current_entry->is_fatal) {
-            uvm_fault_cancel_va_mode_t cancel_va_mode = current_entry->replayable.cancel_va_mode;
-
-            // If cancelling unconditionally and the fault was not fatal,
-            // set the cancel reason passed to this function
-            if (!current_entry->is_fatal) {
-                current_entry->fatal_reason = reason;
-                cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
-            }
-
-            status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
-            if (status != NV_OK)
-                break;
+        // If the fault was already marked fatal, use its reason and cancel
+        // mode. Otherwise use the provided reason.
+        if (current_entry->is_fatal) {
+            UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
+            cancel_va_mode = current_entry->replayable.cancel_va_mode;
        }
+        else {
+            current_entry->fatal_reason = reason;
+            cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
+        }
+
+        status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
+        if (status != NV_OK)
+            break;
    }

    if (va_space != NULL)
        uvm_va_space_up_read(va_space);

-    // After cancelling the fatal faults, the fault buffer is flushed to remove
-    // any potential duplicated fault that may have been added while processing
-    // the faults in this batch. This flush also avoids doing unnecessary
-    // processing after the fatal faults have been cancelled, so all the rest
-    // are unlikely to remain after a replay because the context is probably in
-    // the process of dying.
+    // Because each cancel itself triggers a replay, there may be a large number
+    // of new duplicated faults in the buffer after cancelling all the known
+    // ones. Flushing the buffer discards them to avoid unnecessary processing.
    fault_status = fault_buffer_flush_locked(gpu,
                                             UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
                                             UVM_FAULT_REPLAY_TYPE_START,
@@ -2410,12 +2445,12 @@ static void cancel_fault_batch(uvm_gpu_t *gpu,
                               uvm_fault_service_batch_context_t *batch_context,
                               UvmEventFatalReason reason)
 {
-    if (gpu->parent->fault_cancel_va_supported) {
-        cancel_faults_precise_va(gpu, batch_context, FAULT_CANCEL_MODE_ALL, reason);
-        return;
-    }
-
-    cancel_fault_batch_tlb(gpu, batch_context, reason);
+    // Return code is ignored since we're on a global error path and wouldn't be
+    // able to recover anyway.
+    if (gpu->parent->fault_cancel_va_supported)
+        cancel_faults_all(gpu, batch_context, reason);
+    else
+        cancel_fault_batch_tlb(gpu, batch_context, reason);
 }


@@ -2582,12 +2617,8 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
 static NV_STATUS cancel_faults_precise(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
 {
    UVM_ASSERT(batch_context->has_fatal_faults);
-    if (gpu->parent->fault_cancel_va_supported) {
-        return cancel_faults_precise_va(gpu,
-                                        batch_context,
-                                        FAULT_CANCEL_MODE_FATAL,
-                                        UvmEventFatalReasonInvalid);
-    }
+    if (gpu->parent->fault_cancel_va_supported)
+        return cancel_faults_precise_va(gpu, batch_context);

    return cancel_faults_precise_tlb(gpu, batch_context);
 }
--- a/kernel-open/nvidia-uvm/uvm_hopper_mmu.c
+++ b/kernel-open/nvidia-uvm/uvm_hopper_mmu.c
@@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2020-2022 NVIDIA Corporation
+    Copyright (c) 2020-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -368,7 +368,10 @@ static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
    return pde_bits;
 }

-static void make_pde_hopper(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
+static void make_pde_hopper(void *entry,
+                            uvm_mmu_page_table_alloc_t **phys_allocs,
+                            NvU32 depth,
+                            uvm_page_directory_t *child_dir)
 {
    NvU32 entry_count = entries_per_index_hopper(depth);
    NvU64 *entry_bits = (NvU64 *)entry;
--- a/kernel-open/nvidia-uvm/uvm_linux.h
+++ b/kernel-open/nvidia-uvm/uvm_linux.h
@@ -153,10 +153,6 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
 #define VM_MIXEDMAP    0x00000000
 #endif

-#if !defined(MPOL_PREFERRED_MANY)
-#define MPOL_PREFERRED_MANY    5
-#endif
-
 //
 // printk.h already defined pr_fmt, so we have to redefine it so the pr_*
 // routines pick up our version
--- a/kernel-open/nvidia-uvm/uvm_maxwell_mmu.c
+++ b/kernel-open/nvidia-uvm/uvm_maxwell_mmu.c
@@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2016-2021 NVIDIA Corporation
+    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -106,7 +106,10 @@ static NvU64 small_half_pde_maxwell(uvm_mmu_page_table_alloc_t *phys_alloc)
    return pde_bits;
 }

-static void make_pde_maxwell(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
+static void make_pde_maxwell(void *entry,
+                             uvm_mmu_page_table_alloc_t **phys_allocs,
+                             NvU32 depth,
+                             uvm_page_directory_t *child_dir)
 {
    NvU64 pde_bits = 0;
    UVM_ASSERT(depth == 0);
--- a/kernel-open/nvidia-uvm/uvm_migrate_pageable.h
+++ b/kernel-open/nvidia-uvm/uvm_migrate_pageable.h
@@ -51,7 +51,7 @@ typedef struct
 #if defined(CONFIG_MIGRATE_VMA_HELPER)
 #define UVM_MIGRATE_VMA_SUPPORTED 1
 #else
-#if defined(CONFIG_DEVICE_PRIVATE) && defined(NV_MIGRATE_VMA_SETUP_PRESENT)
+#if NV_IS_EXPORT_SYMBOL_PRESENT_migrate_vma_setup
 #define UVM_MIGRATE_VMA_SUPPORTED 1
 #endif
 #endif
--- a/kernel-open/nvidia-uvm/uvm_mmu.c
+++ b/kernel-open/nvidia-uvm/uvm_mmu.c
@@ -323,37 +323,153 @@ static void uvm_mmu_page_table_cpu_memset_16(uvm_gpu_t *gpu,
    uvm_mmu_page_table_cpu_unmap(gpu, phys_alloc);
 }

+static void pde_fill_cpu(uvm_page_tree_t *tree,
+                         uvm_page_directory_t *directory,
+                         NvU32 start_index,
+                         NvU32 pde_count,
+                         uvm_mmu_page_table_alloc_t **phys_addr)
+{
+    NvU64 pde_data[2], entry_size;
+    NvU32 i;
+
+    UVM_ASSERT(uvm_mmu_use_cpu(tree));
+
+    entry_size = tree->hal->entry_size(directory->depth);
+    UVM_ASSERT(sizeof(pde_data) >= entry_size);
+
+    for (i = 0; i < pde_count; i++) {
+        tree->hal->make_pde(pde_data, phys_addr, directory->depth, directory->entries[start_index + i]);
+
+        if (entry_size == sizeof(pde_data[0]))
+            uvm_mmu_page_table_cpu_memset_8(tree->gpu, &directory->phys_alloc, start_index + i, pde_data[0], 1);
+        else
+            uvm_mmu_page_table_cpu_memset_16(tree->gpu, &directory->phys_alloc, start_index + i, pde_data, 1);
+    }
+}
+
+static void pde_fill_gpu(uvm_page_tree_t *tree,
+                         uvm_page_directory_t *directory,
+                         NvU32 start_index,
+                         NvU32 pde_count,
+                         uvm_mmu_page_table_alloc_t **phys_addr,
+                         uvm_push_t *push)
+{
+    NvU64 pde_data[2], entry_size;
+    uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->phys_alloc.addr);
+    NvU32 max_inline_entries;
+    uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
+    uvm_gpu_address_t inline_data_addr;
+    uvm_push_inline_data_t inline_data;
+    NvU32 entry_count, i, j;
+
+    UVM_ASSERT(!uvm_mmu_use_cpu(tree));
+
+    entry_size = tree->hal->entry_size(directory->depth);
+    UVM_ASSERT(sizeof(pde_data) >= entry_size);
+
+    max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / entry_size;
+
+    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
+        push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
+    else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
+        push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
+
+    pde_entry_addr.address += start_index * entry_size;
+
+    for (i = 0; i < pde_count;) {
+        // All but the first memory operation can be pipelined. We respect the
+        // caller's pipelining settings for the first push.
+        if (i != 0)
+            uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
+
+        entry_count = min(pde_count - i, max_inline_entries);
+
+        // No membar is needed until the last memory operation. Otherwise,
+        // use caller's membar flag.
+        if ((i + entry_count) < pde_count)
+            uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
+        else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
+            uvm_push_set_flag(push, push_membar_flag);
+
+        uvm_push_inline_data_begin(push, &inline_data);
+        for (j = 0; j < entry_count; j++) {
+            tree->hal->make_pde(pde_data, phys_addr, directory->depth, directory->entries[start_index + i + j]);
+            uvm_push_inline_data_add(&inline_data, pde_data, entry_size);
+        }
+        inline_data_addr = uvm_push_inline_data_end(&inline_data);
+
+        tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * entry_size);
+
+        i += entry_count;
+        pde_entry_addr.address += entry_size * entry_count;
+    }
+}
+
+// pde_fill() populates pde_count PDE entries (starting at start_index) with
+// the same mapping, i.e., with the same physical address (phys_addr).
+// pde_fill() is optimized for pde_count == 1, which is the common case. The
+// map_remap() function is the only case where pde_count > 1, only used on GA100
+// GPUs for 512MB page size mappings.
+static void pde_fill(uvm_page_tree_t *tree,
+                     uvm_page_directory_t *directory,
+                     NvU32 start_index,
+                     NvU32 pde_count,
+                     uvm_mmu_page_table_alloc_t **phys_addr,
+                     uvm_push_t *push)
+{
+    UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, directory->depth, UVM_PAGE_SIZE_AGNOSTIC));
+
+    if (push)
+        pde_fill_gpu(tree, directory, start_index, pde_count, phys_addr, push);
+    else
+        pde_fill_cpu(tree, directory, start_index, pde_count, phys_addr);
+}
+
 static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
 {
-    NvU64 clear_bits[2];
-    uvm_mmu_mode_hal_t *hal = tree->hal;
+    NvU32 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);

-    if (dir->depth == tree->hal->page_table_depth(page_size)) {
-        *clear_bits = 0; // Invalid PTE
-    }
-    else {
-        // passing in NULL for the phys_allocs will mark the child entries as invalid
-        uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
-        hal->make_pde(clear_bits, phys_allocs, dir->depth);
+    // Passing in NULL for the phys_allocs will mark the child entries as
+    // invalid.
+    uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};

-        // Make sure that using only clear_bits[0] will work
-        UVM_ASSERT(hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
-    }
+    // Init with an invalid PTE or clean PDE. Only Maxwell PDEs can have more
+    // than 512 entries. We initialize them all with the same clean PDE.
+    // Additionally, only ATS systems may require clean PDEs bit settings based
+    // on the mapping VA.
+    if (dir->depth == tree->hal->page_table_depth(page_size) || (entries_count > 512 && !g_uvm_global.ats.enabled)) {
+        NvU64 clear_bits[2];

-    // initialize the memory to a reasonable value
-    if (push) {
-        tree->gpu->parent->ce_hal->memset_8(push,
-                                            uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
+        // If it is not a PTE, make a clean PDE.
+        if (dir->depth != tree->hal->page_table_depth(page_size)) {
+            tree->hal->make_pde(clear_bits, phys_allocs, dir->depth, dir->entries[0]);
+
+            // Make sure that using only clear_bits[0] will work.
+            UVM_ASSERT(tree->hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
+        }
+        else {
+            *clear_bits = 0;
+        }
+
+        // Initialize the memory to a reasonable value.
+        if (push) {
+            tree->gpu->parent->ce_hal->memset_8(push,
+                                                uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
+                                                *clear_bits,
+                                                dir->phys_alloc.size);
+        }
+        else {
+            uvm_mmu_page_table_cpu_memset_8(tree->gpu,
+                                            &dir->phys_alloc,
+                                            0,
                                            *clear_bits,
-                                            dir->phys_alloc.size);
+                                            dir->phys_alloc.size / sizeof(*clear_bits));
+        }
    }
    else {
-        uvm_mmu_page_table_cpu_memset_8(tree->gpu,
-                                        &dir->phys_alloc,
-                                        0,
-                                        *clear_bits,
-                                        dir->phys_alloc.size / sizeof(*clear_bits));
+        pde_fill(tree, dir, 0, entries_count, phys_allocs, push);
    }
+
 }

 static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
@@ -367,8 +483,10 @@ static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
    NvLength phys_alloc_size = hal->allocation_size(depth, page_size);
    uvm_page_directory_t *dir;

-    // The page tree doesn't cache PTEs so space is not allocated for entries that are always PTEs.
-    // 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not page_size.
+    // The page tree doesn't cache PTEs so space is not allocated for entries
+    // that are always PTEs.
+    // 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not
+    // page_size.
    if (depth == hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC))
        entry_count = 0;
    else
@@ -409,108 +527,6 @@ static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, N
    return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
 }

-static void pde_fill_cpu(uvm_page_tree_t *tree,
-                         NvU32 depth,
-                         uvm_mmu_page_table_alloc_t *directory,
-                         NvU32 start_index,
-                         NvU32 pde_count,
-                         uvm_mmu_page_table_alloc_t **phys_addr)
-{
-    NvU64 pde_data[2], entry_size;
-
-    UVM_ASSERT(uvm_mmu_use_cpu(tree));
-    entry_size = tree->hal->entry_size(depth);
-    UVM_ASSERT(sizeof(pde_data) >= entry_size);
-
-    tree->hal->make_pde(pde_data, phys_addr, depth);
-
-    if (entry_size == sizeof(pde_data[0]))
-        uvm_mmu_page_table_cpu_memset_8(tree->gpu, directory, start_index, pde_data[0], pde_count);
-    else
-        uvm_mmu_page_table_cpu_memset_16(tree->gpu, directory, start_index, pde_data, pde_count);
-}
-
-static void pde_fill_gpu(uvm_page_tree_t *tree,
-                         NvU32 depth,
-                         uvm_mmu_page_table_alloc_t *directory,
-                         NvU32 start_index,
-                         NvU32 pde_count,
-                         uvm_mmu_page_table_alloc_t **phys_addr,
-                         uvm_push_t *push)
-{
-    NvU64 pde_data[2], entry_size;
-    uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->addr);
-
-    UVM_ASSERT(!uvm_mmu_use_cpu(tree));
-
-    entry_size = tree->hal->entry_size(depth);
-    UVM_ASSERT(sizeof(pde_data) >= entry_size);
-
-    tree->hal->make_pde(pde_data, phys_addr, depth);
-    pde_entry_addr.address += start_index * entry_size;
-
-    if (entry_size == sizeof(pde_data[0])) {
-        tree->gpu->parent->ce_hal->memset_8(push, pde_entry_addr, pde_data[0], sizeof(pde_data[0]) * pde_count);
-    }
-    else {
-        NvU32 max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / sizeof(pde_data);
-        uvm_gpu_address_t inline_data_addr;
-        uvm_push_inline_data_t inline_data;
-        NvU32 membar_flag = 0;
-        NvU32 i;
-
-        if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
-            membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
-        else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
-            membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
-
-        for (i = 0; i < pde_count;) {
-            NvU32 j;
-            NvU32 entry_count = min(pde_count - i, max_inline_entries);
-
-            uvm_push_inline_data_begin(push, &inline_data);
-            for (j = 0; j < entry_count; j++)
-                uvm_push_inline_data_add(&inline_data, pde_data, sizeof(pde_data));
-            inline_data_addr = uvm_push_inline_data_end(&inline_data);
-
-            // All but the first memcopy can be pipelined. We respect the
-            // caller's pipelining settings for the first push.
-            if (i != 0)
-                uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
-
-            // No membar is needed until the last copy. Otherwise, use
-            // caller's membar flag.
-            if (i + entry_count < pde_count)
-                uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
-            else if (membar_flag)
-                uvm_push_set_flag(push, membar_flag);
-
-            tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * sizeof(pde_data));
-
-            i += entry_count;
-            pde_entry_addr.address += sizeof(pde_data) * entry_count;
-        }
-    }
-}
-
-// pde_fill() populates pde_count PDE entries (starting at start_index) with
-// the same mapping, i.e., with the same physical address (phys_addr).
-static void pde_fill(uvm_page_tree_t *tree,
-                     NvU32 depth,
-                     uvm_mmu_page_table_alloc_t *directory,
-                     NvU32 start_index,
-                     NvU32 pde_count,
-                     uvm_mmu_page_table_alloc_t **phys_addr,
-                     uvm_push_t *push)
-{
-    UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
-
-    if (push)
-        pde_fill_gpu(tree, depth, directory, start_index, pde_count, phys_addr, push);
-    else
-        pde_fill_cpu(tree, depth, directory, start_index, pde_count, phys_addr);
-}
-
 static uvm_page_directory_t *host_pde_write(uvm_page_directory_t *dir,
                                            uvm_page_directory_t *parent,
                                            NvU32 index_in_parent)
@@ -540,7 +556,7 @@ static void pde_write(uvm_page_tree_t *tree,
            phys_allocs[i] = &entry->phys_alloc;
    }

-    pde_fill(tree, dir->depth, &dir->phys_alloc, entry_index, 1, phys_allocs, push);
+    pde_fill(tree, dir, entry_index, 1, phys_allocs, push);
 }

 static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size)
@@ -813,8 +829,11 @@ static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm

 static void map_remap_deinit(uvm_page_tree_t *tree)
 {
-    if (tree->map_remap.pde0.size)
-        phys_mem_deallocate(tree, &tree->map_remap.pde0);
+    if (tree->map_remap.pde0) {
+        phys_mem_deallocate(tree, &tree->map_remap.pde0->phys_alloc);
+        uvm_kvfree(tree->map_remap.pde0);
+        tree->map_remap.pde0 = NULL;
+    }

    if (tree->map_remap.ptes_invalid_4k.size)
        phys_mem_deallocate(tree, &tree->map_remap.ptes_invalid_4k);
@@ -839,10 +858,16 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
    // PDE1-depth(512M) PTE. We first map it to the pde0 directory, then we
    // return the PTE for the get_ptes()'s caller.
    if (tree->hal->page_sizes() & UVM_PAGE_SIZE_512M) {
-        status = allocate_page_table(tree, UVM_PAGE_SIZE_2M, &tree->map_remap.pde0);
-        if (status != NV_OK)
+        tree->map_remap.pde0 = allocate_directory(tree,
+                                                  UVM_PAGE_SIZE_2M,
+                                                  tree->hal->page_table_depth(UVM_PAGE_SIZE_2M),
+                                                  UVM_PMM_ALLOC_FLAGS_EVICT);
+        if (tree->map_remap.pde0 == NULL) {
+            status = NV_ERR_NO_MEMORY;
            goto error;
+        }
    }
+
    status = page_tree_begin_acquire(tree, &tree->tracker, &push, "map remap init");
    if (status != NV_OK)
        goto error;
@@ -864,22 +889,23 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
        uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
        NvU32 depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_4K) - 1;
        size_t index_4k = tree->hal->entry_offset(depth, UVM_PAGE_SIZE_4K);
-
-        // pde0 depth equals UVM_PAGE_SIZE_2M.
-        NvU32 pde0_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_2M);
-        NvU32 pde0_entries = tree->map_remap.pde0.size / tree->hal->entry_size(pde0_depth);
+        NvU32 pde0_entries = tree->map_remap.pde0->phys_alloc.size / tree->hal->entry_size(tree->map_remap.pde0->depth);

        // The big-page entry is NULL which makes it an invalid entry.
        phys_allocs[index_4k] = &tree->map_remap.ptes_invalid_4k;

        // By default CE operations include a MEMBAR_SYS. MEMBAR_GPU is
        // sufficient when pde0 is allocated in VIDMEM.
-        if (tree->map_remap.pde0.addr.aperture == UVM_APERTURE_VID)
+        if (tree->map_remap.pde0->phys_alloc.addr.aperture == UVM_APERTURE_VID)
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);

+        // This is an orphan directory, make_pde() requires a directory to
+        // compute the VA. The UVM depth map_remap() operates on is not in the
+        // range make_pde() must operate. We only need to supply the fields used
+        // by make_pde() to not access invalid memory addresses.
+
        pde_fill(tree,
-                 pde0_depth,
-                 &tree->map_remap.pde0,
+                 tree->map_remap.pde0,
                 0,
                 pde0_entries,
                 (uvm_mmu_page_table_alloc_t **)&phys_allocs,
@@ -1332,10 +1358,9 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
    if (uvm_page_table_range_aperture(range) == UVM_APERTURE_VID)
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);

-    phys_alloc[0] = &tree->map_remap.pde0;
+    phys_alloc[0] = &tree->map_remap.pde0->phys_alloc;
    pde_fill(tree,
-             range->table->depth,
-             &range->table->phys_alloc,
+             range->table,
             range->start_index,
             range->entry_count,
             (uvm_mmu_page_table_alloc_t **)&phys_alloc,
--- a/kernel-open/nvidia-uvm/uvm_mmu.h
+++ b/kernel-open/nvidia-uvm/uvm_mmu.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2015-2022 NVIDIA Corporation
+    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -219,7 +219,7 @@ struct uvm_mmu_mode_hal_struct
    // point to two items for dual PDEs).
    // any of allocs are allowed to be NULL, in which case they are to be
    // treated as empty.
-    void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, NvU32 depth);
+    void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, NvU32 depth, uvm_page_directory_t *child_dir);

    // size of an entry in a directory/table.  Generally either 8 or 16 bytes.
    // (in the case of Pascal dual PDEs)
@@ -229,7 +229,7 @@ struct uvm_mmu_mode_hal_struct
    NvU32 (*entries_per_index)(NvU32 depth);

    // For dual PDEs, this is ether 1 or 0, depending on the page size.
-    // This is used to index the host copy only.  GPU PDEs are always entirely
+    // This is used to index the host copy only. GPU PDEs are always entirely
    // re-written using make_pde.
    NvLength (*entry_offset)(NvU32 depth, NvU32 page_size);

@@ -295,9 +295,8 @@ struct uvm_page_tree_struct

        // PDE0 where all big-page entries are invalid, and small-page entries
        // point to ptes_invalid_4k.
-        // pde0 is only used on Pascal-Ampere, i.e., they have the same PDE
-        // format.
-        uvm_mmu_page_table_alloc_t pde0;
+        // pde0 is used on Pascal+ GPUs, i.e., they have the same PDE format.
+        uvm_page_directory_t *pde0;
    } map_remap;

    // Tracker for all GPU operations on the tree
@@ -365,21 +364,32 @@ void uvm_page_tree_deinit(uvm_page_tree_t *tree);
 // the same page size without an intervening put_ptes. To duplicate a subset of
 // an existing range or change the size of an existing range, use
 // uvm_page_table_range_get_upper() and/or uvm_page_table_range_shrink().
-NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
-        uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
+NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
+                                 NvU32 page_size,
+                                 NvU64 start,
+                                 NvLength size,
+                                 uvm_pmm_alloc_flags_t pmm_flags,
+                                 uvm_page_table_range_t *range);

 // Same as uvm_page_tree_get_ptes(), but doesn't synchronize the GPU work.
 //
 // All pending operations can be waited on with uvm_page_tree_wait().
-NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
-        uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
+NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
+                                       NvU32 page_size,
+                                       NvU64 start,
+                                       NvLength size,
+                                       uvm_pmm_alloc_flags_t pmm_flags,
+                                       uvm_page_table_range_t *range);

 // Returns a single-entry page table range for the addresses passed.
 // The size parameter must be a page size supported by this tree.
 // This is equivalent to calling uvm_page_tree_get_ptes() with size equal to
 // page_size.
-NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start,
-        uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *single);
+NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
+                                  NvU32 page_size,
+                                  NvU64 start,
+                                  uvm_pmm_alloc_flags_t pmm_flags,
+                                  uvm_page_table_range_t *single);

 // For a single-entry page table range, write the PDE (which could be a dual
 // PDE) to the GPU.
@@ -478,8 +488,8 @@ NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
 // new_range_vec will contain the upper portion of range_vec, starting at
 // new_end + 1.
 //
-// new_end + 1 is required to be within the address range of range_vec and be aligned to
-// range_vec's page_size.
+// new_end + 1 is required to be within the address range of range_vec and be
+// aligned to range_vec's page_size.
 //
 // On failure, the original range vector is left unmodified.
 NV_STATUS uvm_page_table_range_vec_split_upper(uvm_page_table_range_vec_t *range_vec,
@@ -501,18 +511,22 @@ void uvm_page_table_range_vec_destroy(uvm_page_table_range_vec_t *range_vec);
 // for each offset.
 // The caller_data pointer is what the caller passed in as caller_data to
 // uvm_page_table_range_vec_write_ptes().
-typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec, NvU64 offset,
-        void *caller_data);
+typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec,
+                                                  NvU64 offset,
+                                                  void *caller_data);

-// Write all PTEs covered by the range vector using the given PTE making function.
+// Write all PTEs covered by the range vector using the given PTE making
+// function.
 //
 // After writing all the PTEs a TLB invalidate operation is performed including
 // the passed in tlb_membar.
 //
 // See comments about uvm_page_table_range_pte_maker_t for details about the
 // PTE making callback.
-NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar,
-        uvm_page_table_range_pte_maker_t pte_maker, void *caller_data);
+NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
+                                              uvm_membar_t tlb_membar,
+                                              uvm_page_table_range_pte_maker_t pte_maker,
+                                              void *caller_data);

 // Set all PTEs covered by the range vector to an empty PTE
 //
@@ -636,8 +650,9 @@ static NvU64 uvm_page_table_range_size(uvm_page_table_range_t *range)

 // Get the physical address of the entry at entry_index within the range
 // (counted from range->start_index).
-static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree, uvm_page_table_range_t *range,
-        size_t entry_index)
+static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree,
+                                                                 uvm_page_table_range_t *range,
+                                                                 size_t entry_index)
 {
    NvU32 entry_size = uvm_mmu_pte_size(tree, range->page_size);
    uvm_gpu_phys_address_t entry = range->table->phys_alloc.addr;
--- a/kernel-open/nvidia-uvm/uvm_page_tree_test.c
+++ b/kernel-open/nvidia-uvm/uvm_page_tree_test.c
@@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2015-2022 NVIDIA Corporation
+    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -146,9 +146,15 @@ static void fake_tlb_invals_disable(void)
    g_fake_tlb_invals_tracking_enabled = false;
 }

-// Fake TLB invalidate VA that just saves off the parameters so that they can be verified later
-static void fake_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_address_t pdb,
-        NvU32 depth, NvU64 base, NvU64 size, NvU32 page_size, uvm_membar_t membar)
+// Fake TLB invalidate VA that just saves off the parameters so that they can be
+// verified later.
+static void fake_tlb_invalidate_va(uvm_push_t *push,
+                                   uvm_gpu_phys_address_t pdb,
+                                   NvU32 depth,
+                                   NvU64 base,
+                                   NvU64 size,
+                                   NvU32 page_size,
+                                   uvm_membar_t membar)
 {
    if (!g_fake_tlb_invals_tracking_enabled)
        return;
@@ -210,8 +216,8 @@ static bool assert_and_reset_last_invalidate(NvU32 expected_depth, bool expected
    }
    if ((g_last_fake_inval->membar == UVM_MEMBAR_NONE) == expected_membar) {
        UVM_TEST_PRINT("Expected %s membar, got %s instead\n",
-                expected_membar ? "a" : "no",
-                uvm_membar_string(g_last_fake_inval->membar));
+                       expected_membar ? "a" : "no",
+                       uvm_membar_string(g_last_fake_inval->membar));
        result = false;
    }

@@ -230,7 +236,8 @@ static bool assert_last_invalidate_all(NvU32 expected_depth, bool expected_memba
    }
    if (g_last_fake_inval->base != 0 || g_last_fake_inval->size != -1) {
        UVM_TEST_PRINT("Expected invalidate all but got range [0x%llx, 0x%llx) instead\n",
-                g_last_fake_inval->base, g_last_fake_inval->base + g_last_fake_inval->size);
+                       g_last_fake_inval->base,
+                       g_last_fake_inval->base + g_last_fake_inval->size);
        return false;
    }
    if (g_last_fake_inval->depth != expected_depth) {
@@ -247,15 +254,16 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
    UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);

    if (g_fake_invals_count == 0) {
-        UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n",
-                base, base + size);
+        UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n", base, base + size);
        return false;
    }

    if ((inval->base != base || inval->size != size) && inval->base != 0 && inval->size != -1) {
        UVM_TEST_PRINT("Expected invalidate range [0x%llx, 0x%llx), but got range [0x%llx, 0x%llx) instead\n",
-                base, base + size,
-                inval->base, inval->base + inval->size);
+                        base,
+                        base + size,
+                        inval->base,
+                        inval->base + inval->size);
        return false;
    }
    if (inval->depth != expected_depth) {
@@ -270,7 +278,13 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
    return true;
 }

-static bool assert_invalidate_range(NvU64 base, NvU64 size, NvU32 page_size, bool allow_inval_all, NvU32 range_depth, NvU32 all_depth, bool expected_membar)
+static bool assert_invalidate_range(NvU64 base,
+                                    NvU64 size,
+                                    NvU32 page_size,
+                                    bool allow_inval_all,
+                                    NvU32 range_depth,
+                                    NvU32 all_depth,
+                                    bool expected_membar)
 {
    NvU32 i;

@@ -488,7 +502,6 @@ static NV_STATUS alloc_adjacent_pde_64k_memory(uvm_gpu_t *gpu)
    return NV_OK;
 }

-
 static NV_STATUS alloc_nearby_pde_64k_memory(uvm_gpu_t *gpu)
 {
    uvm_page_tree_t tree;
@@ -842,6 +855,7 @@ static NV_STATUS get_two_free_apart(uvm_gpu_t *gpu)
    TEST_CHECK_RET(range2.entry_count == 256);
    TEST_CHECK_RET(range2.table->ref_count == 512);
    TEST_CHECK_RET(range1.table == range2.table);
+
    // 4k page is second entry in a dual PDE
    TEST_CHECK_RET(range1.table == tree.root->entries[0]->entries[0]->entries[0]->entries[1]);
    TEST_CHECK_RET(range1.start_index == 256);
@@ -871,6 +885,7 @@ static NV_STATUS get_overlapping_dual_pdes(uvm_gpu_t *gpu)
    MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, size, size, &range64k), NV_OK);
    TEST_CHECK_RET(range64k.entry_count == 16);
    TEST_CHECK_RET(range64k.table->ref_count == 16);
+
    // 4k page is second entry in a dual PDE
    TEST_CHECK_RET(range64k.table == tree.root->entries[0]->entries[0]->entries[0]->entries[0]);
    TEST_CHECK_RET(range64k.start_index == 16);
@@ -1030,10 +1045,13 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)

    // Depth 4
    NvU64 extent_pte = UVM_PAGE_SIZE_2M;
+
    // Depth 3
    NvU64 extent_pde0 = extent_pte * (1ull << 8);
+
    // Depth 2
    NvU64 extent_pde1 = extent_pde0 * (1ull << 9);
+
    // Depth 1
    NvU64 extent_pde2 = extent_pde1 * (1ull << 9);

@@ -1081,7 +1099,11 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
    return status;
 }

-static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree, NvU64 base, NvU64 size, NvU32 min_page_size, NvU32 max_page_size)
+static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
+                                                 NvU64 base,
+                                                 NvU64 size,
+                                                 NvU32 min_page_size,
+                                                 NvU32 max_page_size)
 {
    NV_STATUS status = NV_OK;
    uvm_push_t push;
@@ -1205,7 +1227,11 @@ static bool assert_range_vec_ptes(uvm_page_table_range_vec_t *range_vec, bool ex
            NvU64 expected_pte = expecting_cleared ? 0 : range_vec->size + offset;
            if (*pte != expected_pte) {
                UVM_TEST_PRINT("PTE is 0x%llx instead of 0x%llx for offset 0x%llx within range [0x%llx, 0x%llx)\n",
-                        *pte, expected_pte, offset, range_vec->start, range_vec->size);
+                               *pte,
+                               expected_pte,
+                               offset,
+                               range_vec->start,
+                               range_vec->size);
                return false;
            }
            offset += range_vec->page_size;
@@ -1226,7 +1252,11 @@ static NV_STATUS test_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec
    TEST_CHECK_RET(data.status == NV_OK);
    TEST_CHECK_RET(data.count == range_vec->size / range_vec->page_size);
    TEST_CHECK_RET(assert_invalidate_range_specific(g_last_fake_inval,
-            range_vec->start, range_vec->size, range_vec->page_size, page_table_depth, membar != UVM_MEMBAR_NONE));
+                                                    range_vec->start,
+                                                    range_vec->size,
+                                                    range_vec->page_size,
+                                                    page_table_depth,
+                                                    membar != UVM_MEMBAR_NONE));
    TEST_CHECK_RET(assert_range_vec_ptes(range_vec, false));

    fake_tlb_invals_disable();
@@ -1249,7 +1279,11 @@ static NV_STATUS test_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec
    return NV_OK;
 }

-static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree, NvU64 start, NvU64 size, NvU32 page_size, uvm_page_table_range_vec_t **range_vec_out)
+static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
+                                       NvU64 start,
+                                       NvU64 size,
+                                       NvU32 page_size,
+                                       uvm_page_table_range_vec_t **range_vec_out)
 {
    uvm_page_table_range_vec_t *range_vec;
    uvm_pmm_alloc_flags_t pmm_flags = UVM_PMM_ALLOC_FLAGS_EVICT;
@@ -1552,17 +1586,17 @@ static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)

        memset(phys_allocs, 0, sizeof(phys_allocs));

-        hal->make_pde(&pde_bits, phys_allocs, 0);
+        hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
        TEST_CHECK_RET(pde_bits == 0x0L);

        phys_allocs[0] = &alloc_sys;
        phys_allocs[1] = &alloc_vid;
-        hal->make_pde(&pde_bits, phys_allocs, 0);
+        hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
        TEST_CHECK_RET(pde_bits == 0x1BBBBBBD99999992LL);

        phys_allocs[0] = &alloc_vid;
        phys_allocs[1] = &alloc_sys;
-        hal->make_pde(&pde_bits, phys_allocs, 0);
+        hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
        TEST_CHECK_RET(pde_bits == 0x9999999E1BBBBBB1LL);

        for (j = 0; j <= 2; j++) {
@@ -1632,6 +1666,7 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
    uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
    uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
    uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
+
    // big versions have [11:8] set as well to test the page table merging
    uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
    uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBBB00LL);
@@ -1639,31 +1674,31 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
    uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);

    // Make sure cleared PDEs work as expected
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, 0, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0);

    memset(pde_bits, 0xFF, sizeof(pde_bits));
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    hal->make_pde(pde_bits, phys_allocs, 3, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);

    // Sys and vidmem PDEs
    phys_allocs[0] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, 0, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);

    phys_allocs[0] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, 0, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);

    // Dual PDEs
    phys_allocs[0] = &alloc_big_sys;
    phys_allocs[1] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    hal->make_pde(pde_bits, phys_allocs, 3, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);

    phys_allocs[0] = &alloc_big_vid;
    phys_allocs[1] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    hal->make_pde(pde_bits, phys_allocs, 3, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);

    // uncached, i.e., the sysmem data is not cached in GPU's L2 cache. Clear
@@ -1727,36 +1762,36 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr
    uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);

    // Make sure cleared PDEs work as expected
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, 0, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0);

    memset(pde_bits, 0xFF, sizeof(pde_bits));
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    hal->make_pde(pde_bits, phys_allocs, 3, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);

    // Sys and vidmem PDEs
    phys_allocs[0] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, 0, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);

    phys_allocs[0] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, 0, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);

    // Dual PDEs
    phys_allocs[0] = &alloc_big_sys;
    phys_allocs[1] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    hal->make_pde(pde_bits, phys_allocs, 3, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);

    phys_allocs[0] = &alloc_big_vid;
    phys_allocs[1] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    hal->make_pde(pde_bits, phys_allocs, 3, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);

    // NO_ATS PDE1 (depth 2)
    phys_allocs[0] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 2);
+    hal->make_pde(pde_bits, phys_allocs, 2, NULL);
    if (g_uvm_global.ats.enabled)
        TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB2A);
    else
@@ -1805,32 +1840,32 @@ static NV_STATUS entry_test_hopper(uvm_gpu_t *gpu, entry_test_page_size_func ent
    uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);

    // Make sure cleared PDEs work as expected
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, 0, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0);

    // Cleared PDEs work as expected for big and small PDEs.
    memset(pde_bits, 0xFF, sizeof(pde_bits));
-    hal->make_pde(pde_bits, phys_allocs, 4);
+    hal->make_pde(pde_bits, phys_allocs, 4, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);

    // Sys and vidmem PDEs, uncached ATS allowed.
    phys_allocs[0] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, 0, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0x999999999900C);

    phys_allocs[0] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, 0, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBB00A);

    // Dual PDEs, uncached.
    phys_allocs[0] = &alloc_big_sys;
    phys_allocs[1] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 4);
+    hal->make_pde(pde_bits, phys_allocs, 4, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A);

    phys_allocs[0] = &alloc_big_vid;
    phys_allocs[1] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 4);
+    hal->make_pde(pde_bits, phys_allocs, 4, NULL);
    TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C);

    // uncached, i.e., the sysmem data is not cached in GPU's L2 cache, and
@@ -2303,7 +2338,8 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
    gpu->parent = parent_gpu;

    // At least test_tlb_invalidates() relies on global state
-    // (g_tlb_invalidate_*) so make sure only one test instance can run at a time.
+    // (g_tlb_invalidate_*) so make sure only one test instance can run at a
+    // time.
    uvm_mutex_lock(&g_uvm_global.global_lock);

    // Allocate the fake TLB tracking state. Notably tests still need to enable
--- a/kernel-open/nvidia-uvm/uvm_pascal_mmu.c
+++ b/kernel-open/nvidia-uvm/uvm_pascal_mmu.c
@@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2015-2020 NVIDIA Corporation
+    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -140,7 +140,10 @@ static NvU64 small_half_pde_pascal(uvm_mmu_page_table_alloc_t *phys_alloc)
    return pde_bits;
 }

-static void make_pde_pascal(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
+static void make_pde_pascal(void *entry,
+                            uvm_mmu_page_table_alloc_t **phys_allocs,
+                            NvU32 depth,
+                            uvm_page_directory_t *child_dir)
 {
    NvU32 entry_count = entries_per_index_pascal(depth);
    NvU64 *entry_bits = (NvU64 *)entry;
--- a/kernel-open/nvidia-uvm/uvm_va_block.c
+++ b/kernel-open/nvidia-uvm/uvm_va_block.c
@@ -10155,6 +10155,30 @@ static uvm_processor_id_t block_select_residency(uvm_va_block_t *va_block,
        uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id))
        return preferred_location;

+    // Check if we should map the closest resident processor remotely on remote CPU fault
+    //
+    // When faulting on CPU, there's a linux process on behalf of it, which is associated
+    // with a unique VM pointed by current->mm. A block of memory residing on GPU is also
+    // associated with VM, pointed by va_block_context->mm. If they match, it's a regular
+    // (local) fault, and we may want to migrate a page from GPU to CPU.
+    // If it's a 'remote' fault, i.e. linux process differs from one associated with block
+    // VM, we might preserve residence.
+    //
+    // Establishing a remote fault without access counters means the memory could stay in
+    // the wrong spot for a long time, which is why we prefer to avoid creating remote
+    // mappings. However when NIC accesses a memory residing on GPU, it's worth to keep it
+    // in place for NIC accesses.
+    //
+    // The logic that's used to detect remote faulting also keeps memory in place for
+    // ptrace accesses. We would prefer to control those policies separately, but the
+    // NIC case takes priority.
+    if (UVM_ID_IS_CPU(processor_id) &&
+        uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
+        va_block_context->mm != current->mm) {
+        UVM_ASSERT(va_block_context->mm != NULL);
+        return closest_resident_processor;
+    }
+
    // If the page is resident on a processor other than the preferred location,
    // or the faulting processor can't access the preferred location, we select
    // the faulting processor as the new residency.
--- a/kernel-open/nvidia-uvm/uvm_va_policy.h
+++ b/kernel-open/nvidia-uvm/uvm_va_policy.h
@@ -193,7 +193,8 @@ uvm_va_policy_node_t *uvm_va_policy_node_iter_next(uvm_va_block_t *va_block, uvm
    for ((node) = uvm_va_policy_node_iter_first((va_block), (start), (end)),  \
         (next) = uvm_va_policy_node_iter_next((va_block), (node), (end));    \
         (node);                                                              \
-         (node) = (next))
+         (node) = (next),                                                     \
+         (next) = uvm_va_policy_node_iter_next((va_block), (node), (end)))

 // Returns the first policy in the range [start, end], if any.
 // Locking: The va_block lock must be held.
--- a/kernel-open/nvidia-uvm/uvm_volta_mmu.c
+++ b/kernel-open/nvidia-uvm/uvm_volta_mmu.c
@@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2017-2021 NVIDIA Corporation
+    Copyright (c) 2017-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -145,7 +145,10 @@ static NvU64 small_half_pde_volta(uvm_mmu_page_table_alloc_t *phys_alloc)
    return pde_bits;
 }

-static void make_pde_volta(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
+static void make_pde_volta(void *entry,
+                           uvm_mmu_page_table_alloc_t **phys_allocs,
+                           NvU32 depth,
+                           uvm_page_directory_t *child_dir)
 {
    NvU32 entry_count = entries_per_index_volta(depth);
    NvU64 *entry_bits = (NvU64 *)entry;
--- a/kernel-open/nvidia/nvlink_export.h
+++ b/kernel-open/nvidia/nvlink_export.h
@@ -46,6 +46,11 @@ NvlStatus nvlink_lib_unload(void);
 */
 NvlStatus nvlink_lib_ioctl_ctrl(nvlink_ioctrl_params *ctrl_params);

+/*
+* Gets number of devices with type deviceType
+*/
+NvlStatus nvlink_lib_return_device_count_by_type(NvU32 deviceType, NvU32 *numDevices);
+
 #ifdef __cplusplus
 }
 #endif
--- a/kernel-open/nvidia/os-interface.c
+++ b/kernel-open/nvidia/os-interface.c
@@ -28,6 +28,11 @@

 #include "nv-time.h"

+#include <linux/mmzone.h>
+#include <linux/numa.h>
+
+#include <linux/pid.h>
+
 extern char *NVreg_TemporaryFilePath;

 #define MAX_ERROR_STRING 512
@@ -2122,6 +2127,43 @@ void NV_API_CALL os_nv_cap_close_fd
    nv_cap_close_fd(fd);
 }

+/*
+ * Reads the total memory and free memory of a NUMA node from the kernel.
+ */
+NV_STATUS NV_API_CALL os_get_numa_node_memory_usage
+(
+    NvS32 node_id,
+    NvU64 *free_memory_bytes,
+    NvU64 *total_memory_bytes
+)
+{
+    struct pglist_data *pgdat;
+    struct zone *zone;
+    NvU32 zone_id;
+
+    if (node_id >= MAX_NUMNODES)
+    {
+        nv_printf(NV_DBG_ERRORS, "Invalid NUMA node ID\n");
+        return NV_ERR_INVALID_ARGUMENT;
+    }
+
+    pgdat = NODE_DATA(node_id);
+
+    *free_memory_bytes = 0;
+    *total_memory_bytes = 0;
+
+    for (zone_id = 0; zone_id < MAX_NR_ZONES; zone_id++)
+    {
+        zone = &(pgdat->node_zones[zone_id]);
+        if (!populated_zone(zone))
+            continue;
+        *free_memory_bytes += (zone_page_state_snapshot(zone, NR_FREE_PAGES) * PAGE_SIZE);
+        *total_memory_bytes += (zone->present_pages * PAGE_SIZE);
+    }
+
+    return NV_OK;
+}
+
 typedef struct os_numa_gpu_mem_hotplug_notifier_s
 {
    NvU64 start_pa;
@@ -2373,3 +2415,28 @@ NV_STATUS NV_API_CALL os_offline_page_at_address
 #endif
 }

+void* NV_API_CALL os_get_pid_info(void)
+{
+    return get_task_pid(current, PIDTYPE_PID);
+}
+
+void NV_API_CALL os_put_pid_info(void *pid_info)
+{
+    if (pid_info != NULL)
+        put_pid(pid_info);
+}
+
+NV_STATUS NV_API_CALL os_find_ns_pid(void *pid_info, NvU32 *ns_pid)
+{
+    if ((pid_info == NULL) || (ns_pid == NULL))
+        return NV_ERR_INVALID_ARGUMENT;
+
+    *ns_pid = pid_vnr((struct pid *)pid_info);
+
+    // The call returns 0 if the PID is not found in the current ns
+    if (*ns_pid == 0)
+        return NV_ERR_OBJECT_NOT_FOUND;
+
+    return NV_OK;
+}
+