535.43.02

Andy Ritger
2023-05-30 10:11:36 -07:00
parent 6dd092ddb7
commit eb5c7665a1
1403 changed files with 295367 additions and 86235 deletions


@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -34,6 +34,7 @@
#include "uvm_mem.h"
#include "uvm_va_space.h"
#include <linux/mm.h>
// The page tree has 6 levels on Hopper+ GPUs, and the root is never freed by a
// normal 'put' operation, which leaves a maximum of 5 levels.
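// Sketch (an assumption for illustration; the actual definition is not shown
// in this hunk): the bound motivated by the comment above, matching the
// free_queue[MAX_OPERATION_DEPTH] declaration used by
// uvm_page_tree_put_ptes_async() later in this diff.
#define MAX_OPERATION_DEPTH 5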
@@ -102,7 +103,7 @@ static NV_STATUS phys_mem_allocate_sysmem(uvm_page_tree_t *tree, NvLength size,
NvU64 dma_addr;
unsigned long flags = __GFP_ZERO;
uvm_memcg_context_t memcg_context;
uvm_va_space_t *va_space;
uvm_va_space_t *va_space = NULL;
struct mm_struct *mm = NULL;
if (tree->type == UVM_PAGE_TREE_TYPE_USER && tree->gpu_va_space && UVM_CGROUP_ACCOUNTING_SUPPORTED()) {
@@ -244,6 +245,84 @@ static void page_table_range_init(uvm_page_table_range_t *range,
dir->ref_count += range->entry_count;
}
static bool uvm_mmu_use_cpu(uvm_page_tree_t *tree)
{
// When physical CE writes can't be used for vidmem we use a flat virtual
// mapping instead. The GPU PTEs for that flat mapping have to be
// bootstrapped using the CPU.
return tree->location != UVM_APERTURE_SYS &&
!tree->gpu->parent->ce_phys_vidmem_write_supported &&
!tree->gpu->static_flat_mapping.ready;
}
// uvm_mmu_page_table_page() and the uvm_mmu_page_table_cpu_* family of
// functions can only be used when uvm_mmu_use_cpu() returns true, which implies
// a coherent system.
static struct page *uvm_mmu_page_table_page(uvm_gpu_t *gpu, uvm_mmu_page_table_alloc_t *phys_alloc)
{
// All platforms that require CPU PTE writes for bootstrapping can fit
// tables within a page.
UVM_ASSERT(phys_alloc->size <= PAGE_SIZE);
if (phys_alloc->addr.aperture == UVM_APERTURE_SYS)
return phys_alloc->handle.page;
return uvm_gpu_chunk_to_page(&gpu->pmm, phys_alloc->handle.chunk);
}
static void *uvm_mmu_page_table_cpu_map(uvm_gpu_t *gpu, uvm_mmu_page_table_alloc_t *phys_alloc)
{
struct page *page = uvm_mmu_page_table_page(gpu, phys_alloc);
NvU64 page_offset = offset_in_page(phys_alloc->addr.address);
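// kmap() covers both apertures here: sysmem allocations carry a struct page,
// and on the coherent systems implied by uvm_mmu_use_cpu() the vidmem chunks
// returned by uvm_gpu_chunk_to_page() are CPU-addressable as well.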
return (char *)kmap(page) + page_offset;
}
static void uvm_mmu_page_table_cpu_unmap(uvm_gpu_t *gpu, uvm_mmu_page_table_alloc_t *phys_alloc)
{
kunmap(uvm_mmu_page_table_page(gpu, phys_alloc));
}
static void uvm_mmu_page_table_cpu_memset_8(uvm_gpu_t *gpu,
uvm_mmu_page_table_alloc_t *phys_alloc,
NvU32 start_index,
NvU64 pattern,
NvU32 num_entries)
{
NvU64 *ptr = uvm_mmu_page_table_cpu_map(gpu, phys_alloc);
size_t i;
UVM_ASSERT(IS_ALIGNED((uintptr_t)ptr, sizeof(*ptr)));
UVM_ASSERT((start_index + num_entries) * sizeof(*ptr) <= phys_alloc->size);
for (i = 0; i < num_entries; i++)
ptr[start_index + i] = pattern;
uvm_mmu_page_table_cpu_unmap(gpu, phys_alloc);
}
static void uvm_mmu_page_table_cpu_memset_16(uvm_gpu_t *gpu,
uvm_mmu_page_table_alloc_t *phys_alloc,
NvU32 start_index,
NvU64 *pattern,
NvU32 num_entries)
{
struct
{
NvU64 u0, u1;
} *ptr;
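// The anonymous 16-byte struct gives ptr the stride of one 128-bit entry;
// entries are written with memcpy() below since C has no portable native
// 128-bit store.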
size_t i;
ptr = uvm_mmu_page_table_cpu_map(gpu, phys_alloc);
UVM_ASSERT(IS_ALIGNED((uintptr_t)ptr, sizeof(*ptr)));
UVM_ASSERT((start_index + num_entries) * sizeof(*ptr) <= phys_alloc->size);
for (i = 0; i < num_entries; i++)
memcpy(&ptr[start_index + i], pattern, sizeof(*ptr));
uvm_mmu_page_table_cpu_unmap(gpu, phys_alloc);
}
static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
{
NvU64 clear_bits[2];
@@ -262,10 +341,19 @@ static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_direc
}
// initialize the memory to a reasonable value
tree->gpu->parent->ce_hal->memset_8(push,
uvm_gpu_address_from_phys(dir->phys_alloc.addr),
if (push) {
tree->gpu->parent->ce_hal->memset_8(push,
uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
*clear_bits,
dir->phys_alloc.size);
}
else {
uvm_mmu_page_table_cpu_memset_8(tree->gpu,
&dir->phys_alloc,
0,
*clear_bits,
dir->phys_alloc.size);
dir->phys_alloc.size / sizeof(*clear_bits));
}
}
static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
@@ -321,25 +409,44 @@ static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, N
return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
}
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
static void pde_fill(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
static void pde_fill_cpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr;
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
UVM_ASSERT(uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
if (entry_size == sizeof(pde_data[0]))
uvm_mmu_page_table_cpu_memset_8(tree->gpu, directory, start_index, pde_data[0], pde_count);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, directory, start_index, pde_data, pde_count);
}
static void pde_fill_gpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->addr);
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
pde_entry_addr = uvm_gpu_address_from_phys(directory->addr);
pde_entry_addr.address += start_index * entry_size;
if (entry_size == sizeof(pde_data[0])) {
@@ -386,6 +493,24 @@ static void pde_fill(uvm_page_tree_t *tree,
}
}
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
static void pde_fill(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
if (push)
pde_fill_gpu(tree, depth, directory, start_index, pde_count, phys_addr, push);
else
pde_fill_cpu(tree, depth, directory, start_index, pde_count, phys_addr);
}
static uvm_page_directory_t *host_pde_write(uvm_page_directory_t *dir,
uvm_page_directory_t *parent,
NvU32 index_in_parent)
@@ -426,7 +551,11 @@ static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU
dir->ref_count--;
}
static void pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size, void *push)
static void pde_clear(uvm_page_tree_t *tree,
uvm_page_directory_t *dir,
NvU32 entry_index,
NvU32 page_size,
uvm_push_t *push)
{
host_pde_clear(tree, dir, entry_index, page_size);
pde_write(tree, dir, entry_index, false, push);
@@ -492,12 +621,62 @@ static NV_STATUS page_tree_end_and_wait(uvm_page_tree_t *tree, uvm_push_t *push)
return NV_OK;
}
// initialize new page tables and insert them into the tree
static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
NvU32 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
static NV_STATUS write_gpu_state_cpu(uvm_page_tree_t *tree,
NvU32 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
{
NvS32 i;
uvm_push_t push;
NV_STATUS status;
uvm_assert_mutex_locked(&tree->lock);
UVM_ASSERT(uvm_mmu_use_cpu(tree));
if (used_count == 0)
return NV_OK;
status = uvm_tracker_wait(&tree->tracker);
if (status != NV_OK)
return status;
for (i = 0; i < used_count; i++)
phys_mem_init(tree, page_size, dirs_used[i], NULL);
// Only a single membar is needed between the memsets of the page tables
// and the writes of the PDEs pointing to those page tables.
mb();
// write entries bottom up, so that they are valid once they're inserted
// into the tree
for (i = used_count - 1; i >= 0; i--)
pde_write(tree, dirs_used[i]->host_parent, dirs_used[i]->index_in_parent, false, NULL);
// A CPU membar is needed between the PDE writes and the subsequent TLB
// invalidate. Work submission guarantees such a membar.
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "%u dirs", used_count);
if (status != NV_OK)
return status;
UVM_ASSERT(invalidate_depth >= 0);
// See the comments in write_gpu_state_gpu()
tree->gpu->parent->host_hal->tlb_invalidate_all(&push,
uvm_page_tree_pdb(tree)->addr,
invalidate_depth,
UVM_MEMBAR_NONE);
page_tree_end(tree, &push);
page_tree_tracker_overwrite_with_push(tree, &push);
return NV_OK;
}
static NV_STATUS write_gpu_state_gpu(uvm_page_tree_t *tree,
NvU32 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
{
NvS32 i;
uvm_push_t push;
@@ -508,11 +687,12 @@ static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
uvm_membar_t membar_after_writes = UVM_MEMBAR_GPU;
uvm_assert_mutex_locked(&tree->lock);
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
if (used_count == 0)
return NV_OK;
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "write_gpu_state: %u dirs", used_count);
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "%u dirs", used_count);
if (status != NV_OK)
return status;
@@ -533,15 +713,15 @@ static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
// Only a single membar is needed between the memsets of the page tables
// and the writes of the PDEs pointing to those page tables.
// The membar can be local if all of the page tables and PDEs are in GPU memory,
// but must be a sysmembar if any of them are in sysmem.
tree->gpu->parent->host_hal->wait_for_idle(&push);
uvm_hal_membar(tree->gpu, &push, membar_after_writes);
// The membar can be local if all of the page tables and PDEs are in GPU
// memory, but must be a sysmembar if any of them are in sysmem.
uvm_hal_wfi_membar(&push, membar_after_writes);
// Reset back to a local membar by default
membar_after_writes = UVM_MEMBAR_GPU;
// write entries bottom up, so that they are valid once they're inserted into the tree
// write entries bottom up, so that they are valid once they're inserted
// into the tree
for (i = used_count - 1; i >= 0; i--) {
uvm_page_directory_t *dir = dirs_used[i];
@@ -553,18 +733,19 @@ static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
// If any of the written PDEs is in sysmem, a sysmembar is needed before
// the TLB invalidate.
// Notably sysmembar is needed even though the writer (CE) and reader (MMU) are
// on the same GPU, because CE physical writes take the L2 bypass path.
// Notably sysmembar is needed even though the writer (CE) and reader
// (MMU) are on the same GPU, because CE physical writes take the L2
// bypass path.
if (dir->host_parent->phys_alloc.addr.aperture == UVM_APERTURE_SYS)
membar_after_writes = UVM_MEMBAR_SYS;
}
tree->gpu->parent->host_hal->wait_for_idle(&push);
uvm_hal_membar(tree->gpu, &push, membar_after_writes);
uvm_hal_wfi_membar(&push, membar_after_writes);
UVM_ASSERT(invalidate_depth >= 0);
// Upgrades don't have to flush out accesses, so no membar is needed on the TLB invalidate.
// Upgrades don't have to flush out accesses, so no membar is needed on the
// TLB invalidate.
tree->gpu->parent->host_hal->tlb_invalidate_all(&push,
uvm_page_tree_pdb(tree)->addr,
invalidate_depth,
@@ -582,6 +763,19 @@ static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
return NV_OK;
}
// initialize new page tables and insert them into the tree
static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
NvU32 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
{
if (uvm_mmu_use_cpu(tree))
return write_gpu_state_cpu(tree, page_size, invalidate_depth, used_count, dirs_used);
else
return write_gpu_state_gpu(tree, page_size, invalidate_depth, used_count, dirs_used);
}
static void free_unused_directories(uvm_page_tree_t *tree,
NvU32 used_count,
uvm_page_directory_t **dirs_used,
@@ -633,6 +827,8 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
uvm_pte_batch_t batch;
NvU32 entry_size;
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
// Allocate the ptes_invalid_4k.
status = allocate_page_table(tree, UVM_PAGE_SIZE_4K, &tree->map_remap.ptes_invalid_4k);
if (status != NV_OK)
@@ -718,6 +914,9 @@ error:
//
// In SR-IOV heavy the page tree must be in vidmem, to prevent guest drivers
// from updating GPU page tables without hypervisor knowledge.
// When the Confidential Computing feature is enabled, all kernel
// allocations must be made in the CPR of vidmem. This is a hardware security
// constraint.
// Inputs                                   Outputs
// init location | uvm_page_table_location || tree->location | tree->location_sys_fallback
// -------------|-------------------------||----------------|----------------
@@ -734,7 +933,8 @@ static void page_tree_set_location(uvm_page_tree_t *tree, uvm_aperture_t locatio
(location == UVM_APERTURE_DEFAULT),
"Invalid location %s (%d)\n", uvm_aperture_string(location), (int)location);
should_location_be_vidmem = uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu);
should_location_be_vidmem = uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu)
|| uvm_conf_computing_mode_enabled(tree->gpu);
// The page tree of a "fake" GPU used during page tree testing can be in
// sysmem even if should_location_be_vidmem is true. A fake GPU can be
@@ -798,6 +998,11 @@ NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
return status;
}
if (uvm_mmu_use_cpu(tree)) {
phys_mem_init(tree, UVM_PAGE_SIZE_AGNOSTIC, tree->root, NULL);
return NV_OK;
}
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "init page tree");
if (status != NV_OK)
return status;
@@ -858,7 +1063,7 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
uvm_page_directory_t *free_queue[MAX_OPERATION_DEPTH];
uvm_page_directory_t *dir = range->table;
uvm_push_t push;
NV_STATUS status;
NV_STATUS status = NV_OK;
NvU32 invalidate_depth = 0;
// The logic of what membar is needed when is pretty subtle, please refer to
@@ -880,34 +1085,58 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
uvm_membar_t this_membar;
if (free_count == 0) {
if (uvm_mmu_use_cpu(tree))
status = uvm_tracker_wait(&tree->tracker);
// begin a push which will be submitted before the memory gets freed
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "put ptes: start: %u, count: %u",
range->start_index, range->entry_count);
// Failure to get a push can only happen if we've hit a fatal UVM
// channel error. We can't perform the unmap, so just leave things
// in place for debug.
if (status == NV_OK) {
// Begin a push which will be submitted before the memory gets
// freed.
//
// When writing with the CPU we don't strictly need to begin
// this push until after the writes are done, but doing it here
// doesn't hurt and makes the function's logic simpler.
status = page_tree_begin_acquire(tree,
&tree->tracker,
&push,
"put ptes: start: %u, count: %u",
range->start_index,
range->entry_count);
}
// Failure to wait for a tracker or get a push can only happen if
// we've hit a fatal UVM channel error. We can't perform the unmap,
// so just leave things in place for debug.
if (status != NV_OK) {
UVM_ASSERT(status == uvm_global_get_status());
dir->ref_count += range->entry_count;
uvm_mutex_unlock(&tree->lock);
return;
goto done;
}
}
// All writes can be pipelined as put_ptes() cannot be called with any
// operations pending on the affected PTEs and PDEs.
uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
if (uvm_mmu_use_cpu(tree)) {
pde_clear(tree, dir->host_parent, dir->index_in_parent, range->page_size, NULL);
}
else {
// All writes can be pipelined as put_ptes() cannot be called with
// any operations pending on the affected PTEs and PDEs.
uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
// Don't issue any membars as part of the clear, a single membar will be
// done below before the invalidate.
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
pde_clear(tree, dir->host_parent, dir->index_in_parent, range->page_size, &push);
// Don't issue any membars as part of the clear, a single membar
// will be done below before the invalidate.
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
pde_clear(tree, dir->host_parent, dir->index_in_parent, range->page_size, &push);
}
invalidate_depth = dir->host_parent->depth;
// Take the membar with the widest scope of any of the pointed-to PDEs
this_membar = uvm_hal_downgrade_membar_type(tree->gpu, dir->phys_alloc.addr.aperture == UVM_APERTURE_VID);
// If we're using the CPU to do the write, a SYS membar is required.
// Otherwise, take the membar with the widest scope of any of the
// pointed-to PDEs.
if (uvm_mmu_use_cpu(tree))
this_membar = UVM_MEMBAR_SYS;
else
this_membar = uvm_hal_downgrade_membar_type(tree->gpu, dir->phys_alloc.addr.aperture == UVM_APERTURE_VID);
membar_after_invalidate = max(membar_after_invalidate, this_membar);
// If any of the cleared PDEs were in sysmem then a SYS membar is
@@ -923,23 +1152,28 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
dir = parent;
}
if (free_count == 0) {
uvm_mutex_unlock(&tree->lock);
return;
}
if (free_count == 0)
goto done;
if (uvm_mmu_use_cpu(tree))
mb();
else
uvm_hal_wfi_membar(&push, membar_after_pde_clears);
tree->gpu->parent->host_hal->wait_for_idle(&push);
uvm_hal_membar(tree->gpu, &push, membar_after_pde_clears);
tree->gpu->parent->host_hal->tlb_invalidate_all(&push,
uvm_page_tree_pdb(tree)->addr,
invalidate_depth,
membar_after_invalidate);
// We just did the appropriate membar above, no need for another one in push_end().
// At least currently as if the L2 bypass path changes to only require a GPU
// membar between PDE write and TLB invalidate, we'll need to push a
// sysmembar so the end-of-push semaphore is ordered behind the PDE writes.
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
if (!uvm_mmu_use_cpu(tree)) {
// We just did the appropriate membar above, no need for another one in
// push_end(). If the L2 bypass path changes to only require a GPU
// membar between PDE write and TLB invalidate, we'll need to push a
// sysmembar so the end-of-push semaphore is ordered behind the PDE
// writes.
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
}
page_tree_end(tree, &push);
page_tree_tracker_overwrite_with_push(tree, &push);
@@ -949,6 +1183,7 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
uvm_kvfree(free_queue[i]);
}
done:
uvm_mutex_unlock(&tree->lock);
}
@@ -1255,19 +1490,22 @@ static NV_STATUS poison_ptes(uvm_page_tree_t *tree,
UVM_ASSERT(pte_dir->depth == tree->hal->page_table_depth(page_size));
// The flat mappings should always be set up when executing this path
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "Poisoning child table of page size %u", page_size);
if (status != NV_OK)
return status;
tree->gpu->parent->ce_hal->memset_8(&push,
uvm_gpu_address_from_phys(pte_dir->phys_alloc.addr),
uvm_mmu_gpu_address(tree->gpu, pte_dir->phys_alloc.addr),
tree->hal->poisoned_pte(),
pte_dir->phys_alloc.size);
// If both the new PTEs and the parent PDE are in vidmem, then a GPU-
// local membar is enough to keep the memset of the PTEs ordered with
// any later write of the PDE. Otherwise we need a sysmembar. See the
// comments in write_gpu_state.
// comments in write_gpu_state_gpu.
if (pte_dir->phys_alloc.addr.aperture == UVM_APERTURE_VID &&
parent->phys_alloc.addr.aperture == UVM_APERTURE_VID)
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
@@ -1527,7 +1765,43 @@ NV_STATUS uvm_page_table_range_vec_split_upper(uvm_page_table_range_vec_t *range
return NV_OK;
}
NV_STATUS uvm_page_table_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar)
static NV_STATUS uvm_page_table_range_vec_clear_ptes_cpu(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar)
{
uvm_page_tree_t *tree = range_vec->tree;
NvU32 entry_size = uvm_mmu_pte_size(tree, range_vec->page_size);
NvU64 invalid_ptes[2] = {0, 0};
uvm_push_t push;
NV_STATUS status;
size_t i;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
for (i = 0; i < range_vec->range_count; ++i) {
uvm_page_table_range_t *range = &range_vec->ranges[i];
uvm_mmu_page_table_alloc_t *dir = &range->table->phys_alloc;
if (entry_size == 8)
uvm_mmu_page_table_cpu_memset_8(tree->gpu, dir, range->start_index, invalid_ptes[0], range->entry_count);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, dir, range->start_index, invalid_ptes, range->entry_count);
}
// A CPU membar is needed between the PTE writes and the subsequent TLB
// invalidate. Work submission guarantees such a membar.
status = page_tree_begin_acquire(tree,
NULL,
&push,
"Invalidating [0x%llx, 0x%llx)",
range_vec->start,
range_vec->start + range_vec->size);
if (status != NV_OK)
return status;
uvm_tlb_batch_single_invalidate(tree, &push, range_vec->start, range_vec->size, range_vec->page_size, tlb_membar);
return page_tree_end_and_wait(tree, &push);
}
static NV_STATUS uvm_page_table_range_vec_clear_ptes_gpu(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar)
{
NV_STATUS status = NV_OK;
NV_STATUS tracker_status;
@@ -1545,6 +1819,7 @@ NV_STATUS uvm_page_table_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_
UVM_ASSERT(range_vec);
UVM_ASSERT(tree);
UVM_ASSERT(gpu);
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
i = 0;
while (i < range_vec->range_count) {
@@ -1595,6 +1870,14 @@ done:
return status;
}
NV_STATUS uvm_page_table_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar)
{
if (uvm_mmu_use_cpu(range_vec->tree))
return uvm_page_table_range_vec_clear_ptes_cpu(range_vec, tlb_membar);
else
return uvm_page_table_range_vec_clear_ptes_gpu(range_vec, tlb_membar);
}
void uvm_page_table_range_vec_deinit(uvm_page_table_range_vec_t *range_vec)
{
size_t i;
@@ -1626,10 +1909,58 @@ void uvm_page_table_range_vec_destroy(uvm_page_table_range_vec_t *range_vec)
uvm_kvfree(range_vec);
}
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker,
void *caller_data)
static NV_STATUS uvm_page_table_range_vec_write_ptes_cpu(uvm_page_table_range_vec_t *range_vec,
uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker,
void *caller_data)
{
NV_STATUS status;
size_t i;
uvm_page_tree_t *tree = range_vec->tree;
NvU32 entry_size = uvm_mmu_pte_size(tree, range_vec->page_size);
uvm_push_t push;
NvU64 offset = 0;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
// Enforce ordering with prior accesses to the pages being mapped before the
// mappings are activated.
mb();
for (i = 0; i < range_vec->range_count; ++i) {
uvm_page_table_range_t *range = &range_vec->ranges[i];
uvm_mmu_page_table_alloc_t *dir = &range->table->phys_alloc;
NvU32 entry;
for (entry = range->start_index; entry < range->start_index + range->entry_count; ++entry) {
NvU64 pte_bits[2] = {pte_maker(range_vec, offset, caller_data), 0};
if (entry_size == 8)
uvm_mmu_page_table_cpu_memset_8(tree->gpu, dir, entry, pte_bits[0], 1);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, dir, entry, pte_bits, 1);
offset += range_vec->page_size;
}
}
status = page_tree_begin_acquire(tree,
NULL,
&push,
"Invalidating [0x%llx, 0x%llx)",
range_vec->start,
range_vec->start + range_vec->size);
if (status != NV_OK)
return status;
uvm_tlb_batch_single_invalidate(tree, &push, range_vec->start, range_vec->size, range_vec->page_size, tlb_membar);
return page_tree_end_and_wait(tree, &push);
}
static NV_STATUS uvm_page_table_range_vec_write_ptes_gpu(uvm_page_table_range_vec_t *range_vec,
uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker,
void *caller_data)
{
NV_STATUS status = NV_OK;
NV_STATUS tracker_status;
@@ -1650,6 +1981,8 @@ NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_
NvU32 max_entries_per_push = max_total_entry_size_per_push / entry_size;
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
for (i = 0; i < range_vec->range_count; ++i) {
uvm_page_table_range_t *range = &range_vec->ranges[i];
NvU64 range_start = range_vec_calc_range_start(range_vec, i);
@@ -1728,6 +2061,17 @@ done:
return status;
}
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker,
void *caller_data)
{
if (uvm_mmu_use_cpu(range_vec->tree))
return uvm_page_table_range_vec_write_ptes_cpu(range_vec, tlb_membar, pte_maker, caller_data);
else
return uvm_page_table_range_vec_write_ptes_gpu(range_vec, tlb_membar, pte_maker, caller_data);
}
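// Illustrative pte_maker callback (an editorial sketch, not part of this
// commit): the signature matches the pte_maker invocations in
// uvm_page_table_range_vec_write_ptes_cpu() above, and the make_pte() HAL call
// mirrors identity_mapping_pte_maker() below.
static NvU64 example_linear_pte_maker(uvm_page_table_range_vec_t *range_vec, NvU64 offset, void *caller_data)
{
    // Map each VA offset within the range vector to a fixed physical base plus
    // that offset, producing a linear (identity-style) mapping.
    NvU64 phys_base = *(NvU64 *)caller_data;

    return range_vec->tree->hal->make_pte(UVM_APERTURE_VID,
                                          phys_base + offset,
                                          UVM_PROT_READ_WRITE_ATOMIC,
                                          UVM_MMU_PTE_FLAGS_CACHED);
}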
typedef struct identity_mapping_pte_maker_data_struct
{
NvU64 phys_offset;
@@ -1745,13 +2089,12 @@ static NvU64 identity_mapping_pte_maker(uvm_page_table_range_vec_t *range_vec, N
}
static NV_STATUS create_identity_mapping(uvm_gpu_t *gpu,
NvU64 base,
uvm_gpu_identity_mapping_t *mapping,
NvU64 size,
uvm_aperture_t aperture,
NvU64 phys_offset,
NvU32 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t **range_vec)
uvm_pmm_alloc_flags_t pmm_flags)
{
NV_STATUS status;
identity_mapping_pte_maker_data_t data =
@@ -1761,32 +2104,36 @@ static NV_STATUS create_identity_mapping(uvm_gpu_t *gpu,
};
status = uvm_page_table_range_vec_create(&gpu->address_space_tree,
base,
mapping->base,
size,
page_size,
pmm_flags,
range_vec);
&mapping->range_vec);
if (status != NV_OK) {
UVM_ERR_PRINT("Failed to init range vec for aperture %d identity mapping at [0x%llx, 0x%llx): %s, GPU %s\n",
aperture,
base,
base + size,
mapping->base,
mapping->base + size,
nvstatusToString(status),
uvm_gpu_name(gpu));
return status;
}
status = uvm_page_table_range_vec_write_ptes(*range_vec, UVM_MEMBAR_NONE, identity_mapping_pte_maker, &data);
status = uvm_page_table_range_vec_write_ptes(mapping->range_vec,
UVM_MEMBAR_NONE,
identity_mapping_pte_maker,
&data);
if (status != NV_OK) {
UVM_ERR_PRINT("Failed to write PTEs for aperture %d identity mapping at [0x%llx, 0x%llx): %s, GPU %s\n",
aperture,
base,
base + size,
mapping->base,
mapping->base + size,
nvstatusToString(status),
uvm_gpu_name(gpu));
return status;
}
mapping->ready = true;
return NV_OK;
}
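// For reference, the mapping object this refactoring threads through
// create_identity_mapping() (a sketch inferred from the fields used in this
// diff; the real definition lives elsewhere in the driver):
//
//     typedef struct
//     {
//         NvU64 base;                            // base VA of the mapping
//         uvm_page_table_range_vec_t *range_vec; // backing page table ranges
//         bool ready;                            // set once the PTEs are written
//     } uvm_gpu_identity_mapping_t;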
@@ -1795,6 +2142,10 @@ static void destroy_identity_mapping(uvm_gpu_identity_mapping_t *mapping)
if (mapping->range_vec == NULL)
return;
// Tell the teardown routines they can't use this mapping as part of their
// teardown.
mapping->ready = false;
(void)uvm_page_table_range_vec_clear_ptes(mapping->range_vec, UVM_MEMBAR_SYS);
uvm_page_table_range_vec_destroy(mapping->range_vec);
mapping->range_vec = NULL;
@@ -1802,7 +2153,7 @@ static void destroy_identity_mapping(uvm_gpu_identity_mapping_t *mapping)
bool uvm_mmu_gpu_needs_static_vidmem_mapping(uvm_gpu_t *gpu)
{
return false;
return !gpu->parent->ce_phys_vidmem_write_supported;
}
bool uvm_mmu_gpu_needs_dynamic_vidmem_mapping(uvm_gpu_t *gpu)
@@ -1838,13 +2189,12 @@ NV_STATUS create_static_vidmem_mapping(uvm_gpu_t *gpu)
flat_mapping->base = gpu->parent->flat_vidmem_va_base;
return create_identity_mapping(gpu,
flat_mapping->base,
flat_mapping,
size,
aperture,
phys_offset,
page_size,
UVM_PMM_ALLOC_FLAGS_EVICT,
&flat_mapping->range_vec);
UVM_PMM_ALLOC_FLAGS_EVICT);
}
static void destroy_static_vidmem_mapping(uvm_gpu_t *gpu)
@@ -1884,13 +2234,12 @@ NV_STATUS uvm_mmu_create_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer)
UVM_ASSERT(peer_mapping->base);
return create_identity_mapping(gpu,
peer_mapping->base,
peer_mapping,
size,
aperture,
phys_offset,
page_size,
UVM_PMM_ALLOC_FLAGS_EVICT,
&peer_mapping->range_vec);
UVM_PMM_ALLOC_FLAGS_EVICT);
}
void uvm_mmu_destroy_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer)
@@ -2304,14 +2653,14 @@ static NV_STATUS create_dynamic_sysmem_mapping(uvm_gpu_t *gpu)
// SR-IOV each mapping addition adds a lot of overhead due to vGPU plugin
// involvement), metadata memory footprint (inversely proportional to the
// mapping size), etc.
mapping_size = 4ULL * 1024 * 1024 * 1024;
mapping_size = 4 * UVM_SIZE_1GB;
// The mapping size should be at least 1GB, due to bitlock limitations. This
// shouldn't be a problem because the expectation is to use 512MB PTEs, and
// using a granularity of 1GB already results in allocating a large array of
// sysmem mappings with 128K entries.
UVM_ASSERT(is_power_of_2(mapping_size));
UVM_ASSERT(mapping_size >= 1ULL * 1024 * 1024 * 1024);
UVM_ASSERT(mapping_size >= UVM_SIZE_1GB);
UVM_ASSERT(mapping_size >= uvm_mmu_biggest_page_size(&gpu->address_space_tree));
UVM_ASSERT(mapping_size <= flat_sysmem_va_size);
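// UVM_SIZE_1GB is assumed to be the driver's usual byte-size helper
// (1024ULL * 1024 * 1024), so 4 * UVM_SIZE_1GB equals the previous
// 4ULL * 1024 * 1024 * 1024. The 128K-entry figure above follows from the 1GB
// granularity: covering 128K * 1GB = 128TB of sysmem takes 128K mapping slots.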
@@ -2367,13 +2716,12 @@ NV_STATUS uvm_mmu_sysmem_map(uvm_gpu_t *gpu, NvU64 pa, NvU64 size)
sysmem_mapping->base = virtual_address.address;
status = create_identity_mapping(gpu,
sysmem_mapping->base,
sysmem_mapping,
gpu->sysmem_mappings.mapping_size,
UVM_APERTURE_SYS,
phys_offset,
page_size,
pmm_flags,
&sysmem_mapping->range_vec);
pmm_flags);
}
sysmem_mapping_unlock(gpu, sysmem_mapping);
@@ -2394,14 +2742,14 @@ NV_STATUS uvm_mmu_create_flat_mappings(uvm_gpu_t *gpu)
{
NV_STATUS status;
status = create_dynamic_sysmem_mapping(gpu);
if (status != NV_OK)
return status;
status = create_static_vidmem_mapping(gpu);
if (status != NV_OK)
goto error;
status = create_dynamic_sysmem_mapping(gpu);
if (status != NV_OK)
return status;
status = create_dynamic_vidmem_mapping(gpu);
if (status != NV_OK)
goto error;
@@ -2416,8 +2764,16 @@ error:
void uvm_mmu_destroy_flat_mappings(uvm_gpu_t *gpu)
{
destroy_dynamic_vidmem_mapping(gpu);
destroy_static_vidmem_mapping(gpu);
destroy_dynamic_sysmem_mapping(gpu);
destroy_static_vidmem_mapping(gpu);
}
uvm_gpu_address_t uvm_mmu_gpu_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t phys_addr)
{
if (phys_addr.aperture == UVM_APERTURE_VID && !gpu->parent->ce_phys_vidmem_write_supported)
return uvm_gpu_address_virtual_from_vidmem_phys(gpu, phys_addr.address);
return uvm_gpu_address_from_phys(phys_addr);
}
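// Usage sketch, mirroring the phys_mem_init() call earlier in this diff:
// callers that used to build a raw physical CE address now go through this
// helper, which substitutes the flat-vidmem virtual address whenever physical
// vidmem writes are unsupported:
//
//     uvm_gpu_address_t dst = uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr);
//     tree->gpu->parent->ce_hal->memset_8(push, dst, *clear_bits, dir->phys_alloc.size);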
NV_STATUS uvm_test_invalidate_tlb(UVM_TEST_INVALIDATE_TLB_PARAMS *params, struct file *filp)