535.43.02

Andy Ritger
2023-05-30 10:11:36 -07:00
parent 6dd092ddb7
commit eb5c7665a1
1403 changed files with 295367 additions and 86235 deletions


@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -34,6 +34,7 @@
#include "uvm_mem.h"
#include "uvm_va_space.h"
#include <linux/mm.h>
// The page tree has 6 levels on Hopper+ GPUs, and the root is never freed by a
// normal 'put' operation, which leaves a maximum of 5 levels.
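// Sketch (an assumption for illustration; the actual definition is not shown
// in this hunk): the bound motivated by the comment above, matching the
// free_queue[MAX_OPERATION_DEPTH] declaration used by
// uvm_page_tree_put_ptes_async() later in this diff.
#define MAX_OPERATION_DEPTH 5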
@@ -102,7 +103,7 @@ static NV_STATUS phys_mem_allocate_sysmem(uvm_page_tree_t *tree, NvLength size,
NvU64 dma_addr;
unsigned long flags = __GFP_ZERO;
uvm_memcg_context_t memcg_context;
uvm_va_space_t *va_space;
uvm_va_space_t *va_space = NULL;
struct mm_struct *mm = NULL;
if (tree->type == UVM_PAGE_TREE_TYPE_USER && tree->gpu_va_space && UVM_CGROUP_ACCOUNTING_SUPPORTED()) {
@@ -244,6 +245,84 @@ static void page_table_range_init(uvm_page_table_range_t *range,
dir->ref_count += range->entry_count;
}
static bool uvm_mmu_use_cpu(uvm_page_tree_t *tree)
{
// When physical CE writes can't be used for vidmem we use a flat virtual
// mapping instead. The GPU PTEs for that flat mapping have to be
// bootstrapped using the CPU.
return tree->location != UVM_APERTURE_SYS &&
!tree->gpu->parent->ce_phys_vidmem_write_supported &&
!tree->gpu->static_flat_mapping.ready;
}
// uvm_mmu_page_table_page() and the uvm_mmu_page_table_cpu_* family of
// functions can only be used when uvm_mmu_use_cpu() returns true, which implies
// a coherent system.
static struct page *uvm_mmu_page_table_page(uvm_gpu_t *gpu, uvm_mmu_page_table_alloc_t *phys_alloc)
{
// All platforms that require CPU PTE writes for bootstrapping can fit
// tables within a page.
UVM_ASSERT(phys_alloc->size <= PAGE_SIZE);
if (phys_alloc->addr.aperture == UVM_APERTURE_SYS)
return phys_alloc->handle.page;
return uvm_gpu_chunk_to_page(&gpu->pmm, phys_alloc->handle.chunk);
}
static void *uvm_mmu_page_table_cpu_map(uvm_gpu_t *gpu, uvm_mmu_page_table_alloc_t *phys_alloc)
{
struct page *page = uvm_mmu_page_table_page(gpu, phys_alloc);
NvU64 page_offset = offset_in_page(phys_alloc->addr.address);
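// kmap() covers both apertures here: sysmem allocations carry a struct page,
// and on the coherent systems implied by uvm_mmu_use_cpu() the vidmem chunks
// returned by uvm_gpu_chunk_to_page() are CPU-addressable as well.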
return (char *)kmap(page) + page_offset;
}
static void uvm_mmu_page_table_cpu_unmap(uvm_gpu_t *gpu, uvm_mmu_page_table_alloc_t *phys_alloc)
{
kunmap(uvm_mmu_page_table_page(gpu, phys_alloc));
}
static void uvm_mmu_page_table_cpu_memset_8(uvm_gpu_t *gpu,
uvm_mmu_page_table_alloc_t *phys_alloc,
NvU32 start_index,
NvU64 pattern,
NvU32 num_entries)
{
NvU64 *ptr = uvm_mmu_page_table_cpu_map(gpu, phys_alloc);
size_t i;
UVM_ASSERT(IS_ALIGNED((uintptr_t)ptr, sizeof(*ptr)));
UVM_ASSERT((start_index + num_entries) * sizeof(*ptr) <= phys_alloc->size);
for (i = 0; i < num_entries; i++)
ptr[start_index + i] = pattern;
uvm_mmu_page_table_cpu_unmap(gpu, phys_alloc);
}
static void uvm_mmu_page_table_cpu_memset_16(uvm_gpu_t *gpu,
uvm_mmu_page_table_alloc_t *phys_alloc,
NvU32 start_index,
NvU64 *pattern,
NvU32 num_entries)
{
struct
{
NvU64 u0, u1;
} *ptr;
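// The anonymous 16-byte struct gives ptr the stride of one 128-bit entry;
// entries are written with memcpy() below since C has no portable native
// 128-bit store.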
size_t i;
ptr = uvm_mmu_page_table_cpu_map(gpu, phys_alloc);
UVM_ASSERT(IS_ALIGNED((uintptr_t)ptr, sizeof(*ptr)));
UVM_ASSERT((start_index + num_entries) * sizeof(*ptr) <= phys_alloc->size);
for (i = 0; i < num_entries; i++)
memcpy(&ptr[start_index + i], pattern, sizeof(*ptr));
uvm_mmu_page_table_cpu_unmap(gpu, phys_alloc);
}
static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
{
NvU64 clear_bits[2];
@@ -262,10 +341,19 @@ static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_direc
}
// initialize the memory to a reasonable value
tree->gpu->parent->ce_hal->memset_8(push,
uvm_gpu_address_from_phys(dir->phys_alloc.addr),
if (push) {
tree->gpu->parent->ce_hal->memset_8(push,
uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
*clear_bits,
dir->phys_alloc.size);
}
else {
uvm_mmu_page_table_cpu_memset_8(tree->gpu,
&dir->phys_alloc,
0,
*clear_bits,
dir->phys_alloc.size);
dir->phys_alloc.size / sizeof(*clear_bits));
}
}
static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
@@ -321,25 +409,44 @@ static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, N
return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
}
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
static void pde_fill(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
static void pde_fill_cpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr;
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
UVM_ASSERT(uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
if (entry_size == sizeof(pde_data[0]))
uvm_mmu_page_table_cpu_memset_8(tree->gpu, directory, start_index, pde_data[0], pde_count);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, directory, start_index, pde_data, pde_count);
}
static void pde_fill_gpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->addr);
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
pde_entry_addr = uvm_gpu_address_from_phys(directory->addr);
pde_entry_addr.address += start_index * entry_size;
if (entry_size == sizeof(pde_data[0])) {
@@ -386,6 +493,24 @@ static void pde_fill(uvm_page_tree_t *tree,
}
}
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
static void pde_fill(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
if (push)
pde_fill_gpu(tree, depth, directory, start_index, pde_count, phys_addr, push);
else
pde_fill_cpu(tree, depth, directory, start_index, pde_count, phys_addr);
}
static uvm_page_directory_t *host_pde_write(uvm_page_directory_t *dir,
uvm_page_directory_t *parent,
NvU32 index_in_parent)
@@ -426,7 +551,11 @@ static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU
dir->ref_count--;
}
static void pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size, void *push)
static void pde_clear(uvm_page_tree_t *tree,
uvm_page_directory_t *dir,
NvU32 entry_index,
NvU32 page_size,
uvm_push_t *push)
{
host_pde_clear(tree, dir, entry_index, page_size);
pde_write(tree, dir, entry_index, false, push);
@@ -492,12 +621,62 @@ static NV_STATUS page_tree_end_and_wait(uvm_page_tree_t *tree, uvm_push_t *push)
return NV_OK;
}
// initialize new page tables and insert them into the tree
static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
NvU32 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
static NV_STATUS write_gpu_state_cpu(uvm_page_tree_t *tree,
NvU32 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
{
NvS32 i;
uvm_push_t push;
NV_STATUS status;
uvm_assert_mutex_locked(&tree->lock);
UVM_ASSERT(uvm_mmu_use_cpu(tree));
if (used_count == 0)
return NV_OK;
status = uvm_tracker_wait(&tree->tracker);
if (status != NV_OK)
return status;
for (i = 0; i < used_count; i++)
phys_mem_init(tree, page_size, dirs_used[i], NULL);
// Only a single membar is needed between the memsets of the page tables
// and the writes of the PDEs pointing to those page tables.
mb();
// write entries bottom up, so that they are valid once they're inserted
// into the tree
for (i = used_count - 1; i >= 0; i--)
pde_write(tree, dirs_used[i]->host_parent, dirs_used[i]->index_in_parent, false, NULL);
// A CPU membar is needed between the PDE writes and the subsequent TLB
// invalidate. Work submission guarantees such a membar.
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "%u dirs", used_count);
if (status != NV_OK)
return status;
UVM_ASSERT(invalidate_depth >= 0);
// See the comments in write_gpu_state_gpu()
tree->gpu->parent->host_hal->tlb_invalidate_all(&push,
uvm_page_tree_pdb(tree)->addr,
invalidate_depth,
UVM_MEMBAR_NONE);
page_tree_end(tree, &push);
page_tree_tracker_overwrite_with_push(tree, &push);
return NV_OK;
}
static NV_STATUS write_gpu_state_gpu(uvm_page_tree_t *tree,
NvU32 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
{
NvS32 i;
uvm_push_t push;
@@ -508,11 +687,12 @@ static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
uvm_membar_t membar_after_writes = UVM_MEMBAR_GPU;
uvm_assert_mutex_locked(&tree->lock);
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
if (used_count == 0)
return NV_OK;
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "write_gpu_state: %u dirs", used_count);
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "%u dirs", used_count);
if (status != NV_OK)
return status;
@@ -533,15 +713,15 @@ static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
// Only a single membar is needed between the memsets of the page tables
// and the writes of the PDEs pointing to those page tables.
// The membar can be local if all of the page tables and PDEs are in GPU memory,
// but must be a sysmembar if any of them are in sysmem.
tree->gpu->parent->host_hal->wait_for_idle(&push);
uvm_hal_membar(tree->gpu, &push, membar_after_writes);
// The membar can be local if all of the page tables and PDEs are in GPU
// memory, but must be a sysmembar if any of them are in sysmem.
uvm_hal_wfi_membar(&push, membar_after_writes);
// Reset back to a local membar by default
membar_after_writes = UVM_MEMBAR_GPU;
// write entries bottom up, so that they are valid once they're inserted into the tree
// write entries bottom up, so that they are valid once they're inserted
// into the tree
for (i = used_count - 1; i >= 0; i--) {
uvm_page_directory_t *dir = dirs_used[i];
@@ -553,18 +733,19 @@ static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
// If any of the written PDEs is in sysmem, a sysmembar is needed before
// the TLB invalidate.
// Notably sysmembar is needed even though the writer (CE) and reader (MMU) are
// on the same GPU, because CE physical writes take the L2 bypass path.
// Notably sysmembar is needed even though the writer (CE) and reader
// (MMU) are on the same GPU, because CE physical writes take the L2
// bypass path.
if (dir->host_parent->phys_alloc.addr.aperture == UVM_APERTURE_SYS)
membar_after_writes = UVM_MEMBAR_SYS;
}
tree->gpu->parent->host_hal->wait_for_idle(&push);
uvm_hal_membar(tree->gpu, &push, membar_after_writes);
uvm_hal_wfi_membar(&push, membar_after_writes);
UVM_ASSERT(invalidate_depth >= 0);
// Upgrades don't have to flush out accesses, so no membar is needed on the TLB invalidate.
// Upgrades don't have to flush out accesses, so no membar is needed on the
// TLB invalidate.
tree->gpu->parent->host_hal->tlb_invalidate_all(&push,
uvm_page_tree_pdb(tree)->addr,
invalidate_depth,
@@ -582,6 +763,19 @@ static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
return NV_OK;
}
// initialize new page tables and insert them into the tree
static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
NvU32 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
{
if (uvm_mmu_use_cpu(tree))
return write_gpu_state_cpu(tree, page_size, invalidate_depth, used_count, dirs_used);
else
return write_gpu_state_gpu(tree, page_size, invalidate_depth, used_count, dirs_used);
}
static void free_unused_directories(uvm_page_tree_t *tree,
NvU32 used_count,
uvm_page_directory_t **dirs_used,
@@ -633,6 +827,8 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
uvm_pte_batch_t batch;
NvU32 entry_size;
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
// Allocate the ptes_invalid_4k.
status = allocate_page_table(tree, UVM_PAGE_SIZE_4K, &tree->map_remap.ptes_invalid_4k);
if (status != NV_OK)
@@ -718,6 +914,9 @@ error:
//
// In SR-IOV heavy the page tree must be in vidmem, to prevent guest drivers
// from updating GPU page tables without hypervisor knowledge.
// When the Confidential Computing feature is enabled, all kernel
// allocations must be made in the CPR of vidmem. This is a hardware security
// constraint.
// Inputs                                   Outputs
// init location | uvm_page_table_location || tree->location | tree->location_sys_fallback
// -------------|-------------------------||----------------|----------------
@@ -734,7 +933,8 @@ static void page_tree_set_location(uvm_page_tree_t *tree, uvm_aperture_t locatio
(location == UVM_APERTURE_DEFAULT),
"Invalid location %s (%d)\n", uvm_aperture_string(location), (int)location);
should_location_be_vidmem = uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu);
should_location_be_vidmem = uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu)
|| uvm_conf_computing_mode_enabled(tree->gpu);
// The page tree of a "fake" GPU used during page tree testing can be in
// sysmem even if should_location_be_vidmem is true. A fake GPU can be
@@ -798,6 +998,11 @@ NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
return status;
}
if (uvm_mmu_use_cpu(tree)) {
phys_mem_init(tree, UVM_PAGE_SIZE_AGNOSTIC, tree->root, NULL);
return NV_OK;
}
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "init page tree");
if (status != NV_OK)
return status;
@@ -858,7 +1063,7 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
uvm_page_directory_t *free_queue[MAX_OPERATION_DEPTH];
uvm_page_directory_t *dir = range->table;
uvm_push_t push;
NV_STATUS status;
NV_STATUS status = NV_OK;
NvU32 invalidate_depth = 0;
// The logic of what membar is needed when is pretty subtle, please refer to
@@ -880,34 +1085,58 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
uvm_membar_t this_membar;
if (free_count == 0) {
if (uvm_mmu_use_cpu(tree))
status = uvm_tracker_wait(&tree->tracker);
// begin a push which will be submitted before the memory gets freed
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "put ptes: start: %u, count: %u",
range->start_index, range->entry_count);
// Failure to get a push can only happen if we've hit a fatal UVM
// channel error. We can't perform the unmap, so just leave things
// in place for debug.
if (status == NV_OK) {
// Begin a push which will be submitted before the memory gets
// freed.
//
// When writing with the CPU we don't strictly need to begin
// this push until after the writes are done, but doing it here
// doesn't hurt and makes the function's logic simpler.
status = page_tree_begin_acquire(tree,
&tree->tracker,
&push,
"put ptes: start: %u, count: %u",
range->start_index,
range->entry_count);
}
// Failure to wait for a tracker or get a push can only happen if
// we've hit a fatal UVM channel error. We can't perform the unmap,
// so just leave things in place for debug.
if (status != NV_OK) {
UVM_ASSERT(status == uvm_global_get_status());
dir->ref_count += range->entry_count;
uvm_mutex_unlock(&tree->lock);
return;
goto done;
}
}
// All writes can be pipelined as put_ptes() cannot be called with any
// operations pending on the affected PTEs and PDEs.
uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
if (uvm_mmu_use_cpu(tree)) {
pde_clear(tree, dir->host_parent, dir->index_in_parent, range->page_size, NULL);
}
else {
// All writes can be pipelined as put_ptes() cannot be called with
// any operations pending on the affected PTEs and PDEs.
uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
// Don't issue any membars as part of the clear, a single membar will be
// done below before the invalidate.
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
pde_clear(tree, dir->host_parent, dir->index_in_parent, range->page_size, &push);
// Don't issue any membars as part of the clear, a single membar
// will be done below before the invalidate.
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
pde_clear(tree, dir->host_parent, dir->index_in_parent, range->page_size, &push);
}
invalidate_depth = dir->host_parent->depth;
// Take the membar with the widest scope of any of the pointed-to PDEs
this_membar = uvm_hal_downgrade_membar_type(tree->gpu, dir->phys_alloc.addr.aperture == UVM_APERTURE_VID);
// If we're using the CPU to do the write, a SYS membar is required.
// Otherwise, take the membar with the widest scope of any of the
// pointed-to PDEs.
if (uvm_mmu_use_cpu(tree))
this_membar = UVM_MEMBAR_SYS;
else
this_membar = uvm_hal_downgrade_membar_type(tree->gpu, dir->phys_alloc.addr.aperture == UVM_APERTURE_VID);
membar_after_invalidate = max(membar_after_invalidate, this_membar);
// If any of the cleared PDEs were in sysmem then a SYS membar is
@@ -923,23 +1152,28 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
dir = parent;
}
if (free_count == 0) {
uvm_mutex_unlock(&tree->lock);
return;
}
if (free_count == 0)
goto done;
if (uvm_mmu_use_cpu(tree))
mb();
else
uvm_hal_wfi_membar(&push, membar_after_pde_clears);
tree->gpu->parent->host_hal->wait_for_idle(&push);
uvm_hal_membar(tree->gpu, &push, membar_after_pde_clears);
tree->gpu->parent->host_hal->tlb_invalidate_all(&push,
uvm_page_tree_pdb(tree)->addr,
invalidate_depth,
membar_after_invalidate);
// We just did the appropriate membar above, no need for another one in push_end().
// At least currently as if the L2 bypass path changes to only require a GPU
// membar between PDE write and TLB invalidate, we'll need to push a
// sysmembar so the end-of-push semaphore is ordered behind the PDE writes.
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
if (!uvm_mmu_use_cpu(tree)) {
// We just did the appropriate membar above, no need for another one in
// push_end(). If the L2 bypass path changes to only require a GPU
// membar between PDE write and TLB invalidate, we'll need to push a
// sysmembar so the end-of-push semaphore is ordered behind the PDE
// writes.
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
}
page_tree_end(tree, &push);
page_tree_tracker_overwrite_with_push(tree, &push);
@@ -949,6 +1183,7 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
uvm_kvfree(free_queue[i]);
}
done:
uvm_mutex_unlock(&tree->lock);
}
@@ -1255,19 +1490,22 @@ static NV_STATUS poison_ptes(uvm_page_tree_t *tree,
UVM_ASSERT(pte_dir->depth == tree->hal->page_table_depth(page_size));
// The flat mappings should always be set up when executing this path
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "Poisoning child table of page size %u", page_size);
if (status != NV_OK)
return status;
tree->gpu->parent->ce_hal->memset_8(&push,
uvm_gpu_address_from_phys(pte_dir->phys_alloc.addr),
uvm_mmu_gpu_address(tree->gpu, pte_dir->phys_alloc.addr),
tree->hal->poisoned_pte(),
pte_dir->phys_alloc.size);
// If both the new PTEs and the parent PDE are in vidmem, then a GPU-
// local membar is enough to keep the memset of the PTEs ordered with
// any later write of the PDE. Otherwise we need a sysmembar. See the
// comments in write_gpu_state.
// comments in write_gpu_state_gpu.
if (pte_dir->phys_alloc.addr.aperture == UVM_APERTURE_VID &&
parent->phys_alloc.addr.aperture == UVM_APERTURE_VID)
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
@@ -1527,7 +1765,43 @@ NV_STATUS uvm_page_table_range_vec_split_upper(uvm_page_table_range_vec_t *range
return NV_OK;
}
NV_STATUS uvm_page_table_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar)
static NV_STATUS uvm_page_table_range_vec_clear_ptes_cpu(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar)
{
uvm_page_tree_t *tree = range_vec->tree;
NvU32 entry_size = uvm_mmu_pte_size(tree, range_vec->page_size);
NvU64 invalid_ptes[2] = {0, 0};
uvm_push_t push;
NV_STATUS status;
size_t i;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
for (i = 0; i < range_vec->range_count; ++i) {
uvm_page_table_range_t *range = &range_vec->ranges[i];
uvm_mmu_page_table_alloc_t *dir = &range->table->phys_alloc;
if (entry_size == 8)
uvm_mmu_page_table_cpu_memset_8(tree->gpu, dir, range->start_index, invalid_ptes[0], range->entry_count);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, dir, range->start_index, invalid_ptes, range->entry_count);
}
// A CPU membar is needed between the PTE writes and the subsequent TLB
// invalidate. Work submission guarantees such a membar.
status = page_tree_begin_acquire(tree,
NULL,
&push,
"Invalidating [0x%llx, 0x%llx)",
range_vec->start,
range_vec->start + range_vec->size);
if (status != NV_OK)
return status;
uvm_tlb_batch_single_invalidate(tree, &push, range_vec->start, range_vec->size, range_vec->page_size, tlb_membar);
return page_tree_end_and_wait(tree, &push);
}
static NV_STATUS uvm_page_table_range_vec_clear_ptes_gpu(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar)
{
NV_STATUS status = NV_OK;
NV_STATUS tracker_status;
@@ -1545,6 +1819,7 @@ NV_STATUS uvm_page_table_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_
UVM_ASSERT(range_vec);
UVM_ASSERT(tree);
UVM_ASSERT(gpu);
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
i = 0;
while (i < range_vec->range_count) {
@@ -1595,6 +1870,14 @@ done:
return status;
}
NV_STATUS uvm_page_table_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar)
{
if (uvm_mmu_use_cpu(range_vec->tree))
return uvm_page_table_range_vec_clear_ptes_cpu(range_vec, tlb_membar);
else
return uvm_page_table_range_vec_clear_ptes_gpu(range_vec, tlb_membar);
}
void uvm_page_table_range_vec_deinit(uvm_page_table_range_vec_t *range_vec)
{
size_t i;
@@ -1626,10 +1909,58 @@ void uvm_page_table_range_vec_destroy(uvm_page_table_range_vec_t *range_vec)
uvm_kvfree(range_vec);
}
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker,
void *caller_data)
static NV_STATUS uvm_page_table_range_vec_write_ptes_cpu(uvm_page_table_range_vec_t *range_vec,
uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker,
void *caller_data)
{
NV_STATUS status;
size_t i;
uvm_page_tree_t *tree = range_vec->tree;
NvU32 entry_size = uvm_mmu_pte_size(tree, range_vec->page_size);
uvm_push_t push;
NvU64 offset = 0;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
// Enforce ordering with prior accesses to the pages being mapped before the
// mappings are activated.
mb();
for (i = 0; i < range_vec->range_count; ++i) {
uvm_page_table_range_t *range = &range_vec->ranges[i];
uvm_mmu_page_table_alloc_t *dir = &range->table->phys_alloc;
NvU32 entry;
for (entry = range->start_index; entry < range->start_index + range->entry_count; ++entry) {
NvU64 pte_bits[2] = {pte_maker(range_vec, offset, caller_data), 0};
if (entry_size == 8)
uvm_mmu_page_table_cpu_memset_8(tree->gpu, dir, entry, pte_bits[0], 1);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, dir, entry, pte_bits, 1);
offset += range_vec->page_size;
}
}
status = page_tree_begin_acquire(tree,
NULL,
&push,
"Invalidating [0x%llx, 0x%llx)",
range_vec->start,
range_vec->start + range_vec->size);
if (status != NV_OK)
return status;
uvm_tlb_batch_single_invalidate(tree, &push, range_vec->start, range_vec->size, range_vec->page_size, tlb_membar);
return page_tree_end_and_wait(tree, &push);
}
static NV_STATUS uvm_page_table_range_vec_write_ptes_gpu(uvm_page_table_range_vec_t *range_vec,
uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker,
void *caller_data)
{
NV_STATUS status = NV_OK;
NV_STATUS tracker_status;
@@ -1650,6 +1981,8 @@ NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_
NvU32 max_entries_per_push = max_total_entry_size_per_push / entry_size;
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
for (i = 0; i < range_vec->range_count; ++i) {
uvm_page_table_range_t *range = &range_vec->ranges[i];
NvU64 range_start = range_vec_calc_range_start(range_vec, i);
@@ -1728,6 +2061,17 @@ done:
return status;
}
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker,
void *caller_data)
{
if (uvm_mmu_use_cpu(range_vec->tree))
return uvm_page_table_range_vec_write_ptes_cpu(range_vec, tlb_membar, pte_maker, caller_data);
else
return uvm_page_table_range_vec_write_ptes_gpu(range_vec, tlb_membar, pte_maker, caller_data);
}
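// Illustrative pte_maker callback (an editorial sketch, not part of this
// commit): the signature matches the pte_maker invocations in
// uvm_page_table_range_vec_write_ptes_cpu() above, and the make_pte() HAL call
// mirrors identity_mapping_pte_maker() below.
static NvU64 example_linear_pte_maker(uvm_page_table_range_vec_t *range_vec, NvU64 offset, void *caller_data)
{
    // Map each VA offset within the range vector to a fixed physical base plus
    // that offset, producing a linear (identity-style) mapping.
    NvU64 phys_base = *(NvU64 *)caller_data;

    return range_vec->tree->hal->make_pte(UVM_APERTURE_VID,
                                          phys_base + offset,
                                          UVM_PROT_READ_WRITE_ATOMIC,
                                          UVM_MMU_PTE_FLAGS_CACHED);
}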
typedef struct identity_mapping_pte_maker_data_struct
{
NvU64 phys_offset;
@@ -1745,13 +2089,12 @@ static NvU64 identity_mapping_pte_maker(uvm_page_table_range_vec_t *range_vec, N
}
static NV_STATUS create_identity_mapping(uvm_gpu_t *gpu,
NvU64 base,
uvm_gpu_identity_mapping_t *mapping,
NvU64 size,
uvm_aperture_t aperture,
NvU64 phys_offset,
NvU32 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t **range_vec)
uvm_pmm_alloc_flags_t pmm_flags)
{
NV_STATUS status;
identity_mapping_pte_maker_data_t data =
@@ -1761,32 +2104,36 @@ static NV_STATUS create_identity_mapping(uvm_gpu_t *gpu,
};
status = uvm_page_table_range_vec_create(&gpu->address_space_tree,
base,
mapping->base,
size,
page_size,
pmm_flags,
range_vec);
&mapping->range_vec);
if (status != NV_OK) {
UVM_ERR_PRINT("Failed to init range vec for aperture %d identity mapping at [0x%llx, 0x%llx): %s, GPU %s\n",
aperture,
base,
base + size,
mapping->base,
mapping->base + size,
nvstatusToString(status),
uvm_gpu_name(gpu));
return status;
}
status = uvm_page_table_range_vec_write_ptes(*range_vec, UVM_MEMBAR_NONE, identity_mapping_pte_maker, &data);
status = uvm_page_table_range_vec_write_ptes(mapping->range_vec,
UVM_MEMBAR_NONE,
identity_mapping_pte_maker,
&data);
if (status != NV_OK) {
UVM_ERR_PRINT("Failed to write PTEs for aperture %d identity mapping at [0x%llx, 0x%llx): %s, GPU %s\n",
aperture,
base,
base + size,
mapping->base,
mapping->base + size,
nvstatusToString(status),
uvm_gpu_name(gpu));
return status;
}
mapping->ready = true;
return NV_OK;
}
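// For reference, the mapping object this refactoring threads through
// create_identity_mapping() (a sketch inferred from the fields used in this
// diff; the real definition lives elsewhere in the driver):
//
//     typedef struct
//     {
//         NvU64 base;                            // base VA of the mapping
//         uvm_page_table_range_vec_t *range_vec; // backing page table ranges
//         bool ready;                            // set once the PTEs are written
//     } uvm_gpu_identity_mapping_t;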
@@ -1795,6 +2142,10 @@ static void destroy_identity_mapping(uvm_gpu_identity_mapping_t *mapping)
if (mapping->range_vec == NULL)
return;
// Tell the teardown routines they can't use this mapping as part of their
// teardown.
mapping->ready = false;
(void)uvm_page_table_range_vec_clear_ptes(mapping->range_vec, UVM_MEMBAR_SYS);
uvm_page_table_range_vec_destroy(mapping->range_vec);
mapping->range_vec = NULL;
@@ -1802,7 +2153,7 @@ static void destroy_identity_mapping(uvm_gpu_identity_mapping_t *mapping)
bool uvm_mmu_gpu_needs_static_vidmem_mapping(uvm_gpu_t *gpu)
{
return false;
return !gpu->parent->ce_phys_vidmem_write_supported;
}
bool uvm_mmu_gpu_needs_dynamic_vidmem_mapping(uvm_gpu_t *gpu)
@@ -1838,13 +2189,12 @@ NV_STATUS create_static_vidmem_mapping(uvm_gpu_t *gpu)
flat_mapping->base = gpu->parent->flat_vidmem_va_base;
return create_identity_mapping(gpu,
flat_mapping->base,
flat_mapping,
size,
aperture,
phys_offset,
page_size,
UVM_PMM_ALLOC_FLAGS_EVICT,
&flat_mapping->range_vec);
UVM_PMM_ALLOC_FLAGS_EVICT);
}
static void destroy_static_vidmem_mapping(uvm_gpu_t *gpu)
@@ -1884,13 +2234,12 @@ NV_STATUS uvm_mmu_create_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer)
UVM_ASSERT(peer_mapping->base);
return create_identity_mapping(gpu,
peer_mapping->base,
peer_mapping,
size,
aperture,
phys_offset,
page_size,
UVM_PMM_ALLOC_FLAGS_EVICT,
&peer_mapping->range_vec);
UVM_PMM_ALLOC_FLAGS_EVICT);
}
void uvm_mmu_destroy_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer)
@@ -2304,14 +2653,14 @@ static NV_STATUS create_dynamic_sysmem_mapping(uvm_gpu_t *gpu)
// SR-IOV each mapping addition adds a lot of overhead due to vGPU plugin
// involvement), metadata memory footprint (inversely proportional to the
// mapping size), etc.
mapping_size = 4ULL * 1024 * 1024 * 1024;
mapping_size = 4 * UVM_SIZE_1GB;
// The mapping size should be at least 1GB, due to bitlock limitations. This
// shouldn't be a problem because the expectation is to use 512MB PTEs, and
// using a granularity of 1GB already results in allocating a large array of
// sysmem mappings with 128K entries.
UVM_ASSERT(is_power_of_2(mapping_size));
UVM_ASSERT(mapping_size >= 1ULL * 1024 * 1024 * 1024);
UVM_ASSERT(mapping_size >= UVM_SIZE_1GB);
UVM_ASSERT(mapping_size >= uvm_mmu_biggest_page_size(&gpu->address_space_tree));
UVM_ASSERT(mapping_size <= flat_sysmem_va_size);
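// UVM_SIZE_1GB is assumed to be the driver's usual byte-size helper
// (1024ULL * 1024 * 1024), so 4 * UVM_SIZE_1GB equals the previous
// 4ULL * 1024 * 1024 * 1024. The 128K-entry figure above follows from the 1GB
// granularity: covering 128K * 1GB = 128TB of sysmem takes 128K mapping slots.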
@@ -2367,13 +2716,12 @@ NV_STATUS uvm_mmu_sysmem_map(uvm_gpu_t *gpu, NvU64 pa, NvU64 size)
sysmem_mapping->base = virtual_address.address;
status = create_identity_mapping(gpu,
sysmem_mapping->base,
sysmem_mapping,
gpu->sysmem_mappings.mapping_size,
UVM_APERTURE_SYS,
phys_offset,
page_size,
pmm_flags,
&sysmem_mapping->range_vec);
pmm_flags);
}
sysmem_mapping_unlock(gpu, sysmem_mapping);
@@ -2394,14 +2742,14 @@ NV_STATUS uvm_mmu_create_flat_mappings(uvm_gpu_t *gpu)
{
NV_STATUS status;
status = create_dynamic_sysmem_mapping(gpu);
if (status != NV_OK)
return status;
status = create_static_vidmem_mapping(gpu);
if (status != NV_OK)
goto error;
status = create_dynamic_sysmem_mapping(gpu);
if (status != NV_OK)
return status;
status = create_dynamic_vidmem_mapping(gpu);
if (status != NV_OK)
goto error;
@@ -2416,8 +2764,16 @@ error:
void uvm_mmu_destroy_flat_mappings(uvm_gpu_t *gpu)
{
destroy_dynamic_vidmem_mapping(gpu);
destroy_static_vidmem_mapping(gpu);
destroy_dynamic_sysmem_mapping(gpu);
destroy_static_vidmem_mapping(gpu);
}
uvm_gpu_address_t uvm_mmu_gpu_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t phys_addr)
{
if (phys_addr.aperture == UVM_APERTURE_VID && !gpu->parent->ce_phys_vidmem_write_supported)
return uvm_gpu_address_virtual_from_vidmem_phys(gpu, phys_addr.address);
return uvm_gpu_address_from_phys(phys_addr);
}
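// Usage sketch, mirroring the phys_mem_init() call earlier in this diff:
// callers that used to build a raw physical CE address now go through this
// helper, which substitutes the flat-vidmem virtual address whenever physical
// vidmem writes are unsupported:
//
//     uvm_gpu_address_t dst = uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr);
//     tree->gpu->parent->ce_hal->memset_8(push, dst, *clear_bits, dir->phys_alloc.size);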
NV_STATUS uvm_test_invalidate_tlb(UVM_TEST_INVALIDATE_TLB_PARAMS *params, struct file *filp)