570.124.04

Bernhard Stoeckner
2025-02-27 17:32:23 +01:00
parent 81fe4fb417
commit 129479b1b7
141 changed files with 102245 additions and 100070 deletions

View File

@@ -29,6 +29,7 @@
#include <linux/nodemask.h>
#include <linux/mempolicy.h>
#include <linux/mmu_notifier.h>
#include <linux/topology.h>
#if UVM_HMM_RANGE_FAULT_SUPPORTED()
#include <linux/hmm.h>
@@ -291,6 +292,27 @@ static const struct mmu_interval_notifier_ops uvm_ats_notifier_ops =
#endif
static bool resident_policy_match(struct vm_area_struct *vma, int dst_nid, int src_nid)
{
#if defined(NV_MEMPOLICY_HAS_UNIFIED_NODES)
struct mempolicy *vma_policy = vma_policy(vma);
// TODO: Bug 4981209: When migrations between CPU numa nodes are supported,
// add (dst_nid != closest_cpu_numa_node) to allow migrations between CPU
// NUMA nodes when destination is the closest_cpu_numa_node.
if (vma_policy &&
node_isset(src_nid, vma_policy->nodes) &&
node_isset(dst_nid, vma_policy->nodes) &&
!cpumask_empty(cpumask_of_node(src_nid)) &&
!cpumask_empty(cpumask_of_node(dst_nid))) {
return true;
}
#endif
return false;
}
static NV_STATUS ats_compute_residency_mask(uvm_gpu_va_space_t *gpu_va_space,
struct vm_area_struct *vma,
NvU64 base,
@@ -370,9 +392,23 @@ static NV_STATUS ats_compute_residency_mask(uvm_gpu_va_space_t *gpu_va_space,
if (pfn & HMM_PFN_VALID) {
struct page *page = hmm_pfn_to_page(pfn);
int resident_node = page_to_nid(page);
if (page_to_nid(page) == ats_context->residency_node)
// Set the residency_mask if:
// - The page is already resident at the intended destination.
// or
// - Both the source and destination nodes are CPU nodes and the
// source node is already in the list of preferred nodes for
// the vma. On multi-CPU NUMA node architectures, this avoids
// unnecessary migrations between CPU nodes. Since the
// specific ats_context->residency_node selected by
// ats_batch_select_residency() is just a guess among the list
// of preferred nodes, paying the cost of migration across the
// CPU preferred nodes in this case can't be justified.
if ((resident_node == ats_context->residency_node) ||
resident_policy_match(vma, ats_context->residency_node, resident_node)) {
uvm_page_mask_set(residency_mask, page_index);
}
ats_context->prefetch_state.first_touch = false;
}
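As a rough standalone sketch of the decision above (a simplified userspace model: the nodemask and policy are reduced to plain bitmasks, and all node IDs are made up), a page counts as resident if it already sits on the chosen destination node, or if both the source and destination are CPU NUMA nodes listed in the VMA's preferred nodemask:

#include <stdbool.h>
#include <stdio.h>

// Simplified stand-in for the kernel's nodemask/mempolicy; bit i set means node i.
typedef unsigned long nodemask_t;

static bool node_in_mask(nodemask_t mask, int nid)
{
    return (mask >> nid) & 1UL;
}

// Mirrors resident_policy_match(): both nodes preferred by the VMA policy and
// both backed by CPUs (the driver checks cpumask_of_node() for the latter).
static bool policy_match(nodemask_t preferred, nodemask_t cpu_nodes, int dst, int src)
{
    return node_in_mask(preferred, src) && node_in_mask(preferred, dst) &&
           node_in_mask(cpu_nodes, src) && node_in_mask(cpu_nodes, dst);
}

// Mirrors the residency_mask condition in ats_compute_residency_mask().
static bool counts_as_resident(int resident_node, int residency_node,
                               nodemask_t preferred, nodemask_t cpu_nodes)
{
    return resident_node == residency_node ||
           policy_match(preferred, cpu_nodes, residency_node, resident_node);
}

int main(void)
{
    nodemask_t preferred = 0x3;  // VMA policy prefers nodes 0 and 1
    nodemask_t cpu_nodes = 0x3;  // nodes 0 and 1 both have CPUs
    // Page resident on node 1 while the guessed destination is node 0:
    // no migration between the two preferred CPU nodes.
    printf("%d\n", counts_as_resident(1, 0, preferred, cpu_nodes));  // prints 1
    return 0;
}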

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -35,6 +35,7 @@
#include "uvm_mmu.h"
#include "uvm_perf_heuristics.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_pmm_gpu.h"
#include "uvm_migrate.h"
#include "uvm_gpu_access_counters.h"
#include "uvm_va_space_mm.h"
@@ -90,6 +91,8 @@ NV_STATUS uvm_global_init(void)
uvm_spin_lock_irqsave_init(&g_uvm_global.gpu_table_lock, UVM_LOCK_ORDER_LEAF);
uvm_mutex_init(&g_uvm_global.va_spaces.lock, UVM_LOCK_ORDER_VA_SPACES_LIST);
INIT_LIST_HEAD(&g_uvm_global.va_spaces.list);
uvm_mutex_init(&g_uvm_global.devmem_ranges.lock, UVM_LOCK_ORDER_LEAF);
INIT_LIST_HEAD(&g_uvm_global.devmem_ranges.list);
status = uvm_kvmalloc_init();
if (status != NV_OK) {
@@ -231,6 +234,7 @@ void uvm_global_exit(void)
uvm_va_policy_exit();
uvm_mem_global_exit();
uvm_pmm_sysmem_exit();
uvm_pmm_devmem_exit();
uvm_gpu_exit();
uvm_processor_mask_cache_exit();

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -157,6 +157,12 @@ struct uvm_global_struct
// This field is set once during global initialization (uvm_global_init),
// and can be read afterwards without acquiring any locks.
bool conf_computing_enabled;
// List of all devmem ranges allocated on this GPU
struct {
uvm_mutex_t lock;
struct list_head list;
} devmem_ranges;
};
// Initialize global uvm state

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -109,8 +109,10 @@ static void fill_parent_gpu_info(uvm_parent_gpu_t *parent_gpu, const UvmGpuInfo
// nvswitch is routed via physical pages, where the upper 13 bits of the
// 47-bit address space hold the routing information for each peer.
// Currently, this is limited to a 16GB framebuffer window size.
if (parent_gpu->nvswitch_info.is_nvswitch_connected)
if (parent_gpu->nvswitch_info.is_nvswitch_connected) {
parent_gpu->nvswitch_info.fabric_memory_window_start = gpu_info->nvswitchMemoryWindowStart;
parent_gpu->nvswitch_info.egm_fabric_memory_window_start = gpu_info->nvswitchEgmMemoryWindowStart;
}
uvm_uuid_string(uuid_buffer, &parent_gpu->uuid);
snprintf(parent_gpu->name,
@@ -244,6 +246,7 @@ static NV_STATUS get_gpu_fb_info(uvm_gpu_t *gpu)
if (!fb_info.bZeroFb) {
gpu->mem_info.size = ((NvU64)fb_info.heapSize + fb_info.reservedHeapSize) * 1024;
gpu->mem_info.max_allocatable_address = fb_info.maxAllocatableAddress;
gpu->mem_info.phys_start = (NvU64)fb_info.heapStart * 1024;
}
gpu->mem_info.max_vidmem_page_size = fb_info.maxVidmemPageSize;
@@ -568,6 +571,9 @@ static void gpu_info_print_common(uvm_gpu_t *gpu, struct seq_file *s)
UVM_SEQ_OR_DBG_PRINT(s, "big_page_size %u\n", gpu->big_page.internal_size);
UVM_SEQ_OR_DBG_PRINT(s, "rm_va_base 0x%llx\n", gpu->parent->rm_va_base);
UVM_SEQ_OR_DBG_PRINT(s, "rm_va_size 0x%llx\n", gpu->parent->rm_va_size);
UVM_SEQ_OR_DBG_PRINT(s, "vidmem_start %llu (%llu MBs)\n",
gpu->mem_info.phys_start,
gpu->mem_info.phys_start / (1024 * 1024));
UVM_SEQ_OR_DBG_PRINT(s, "vidmem_size %llu (%llu MBs)\n",
gpu->mem_info.size,
gpu->mem_info.size / (1024 * 1024));
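The heap fields reported by RM in UvmGpuFbInfo appear to be in kilobytes, given the *1024 scaling above; a trivial standalone illustration of the conversions feeding the vidmem_start/vidmem_size lines (the heap values are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    // Hypothetical RM-reported values, in KB.
    uint64_t heap_size_kb     = 16ull * 1024 * 1024;  // 16 GB heap
    uint64_t reserved_heap_kb = 512ull * 1024;        // 512 MB reserved heap
    uint64_t heap_start_kb    = 0;

    uint64_t size       = (heap_size_kb + reserved_heap_kb) * 1024;  // bytes
    uint64_t phys_start = heap_start_kb * 1024;                      // bytes

    printf("vidmem_start %llu (%llu MBs)\n",
           (unsigned long long)phys_start,
           (unsigned long long)(phys_start / (1024 * 1024)));
    printf("vidmem_size %llu (%llu MBs)\n",
           (unsigned long long)size,
           (unsigned long long)(size / (1024 * 1024)));
    return 0;
}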
@@ -1361,6 +1367,7 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
const UvmGpuPlatformInfo *gpu_platform_info)
{
NV_STATUS status;
UvmGpuFbInfo fb_info = {0};
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_global_session_handle(),
gpu_info,
@@ -1384,8 +1391,15 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
parent_gpu->egm.local_peer_id = gpu_info->egmPeerId;
parent_gpu->egm.base_address = gpu_info->egmBaseAddr;
status = uvm_rm_locked_call(nvUvmInterfaceGetFbInfo(parent_gpu->rm_device, &fb_info));
if (status != NV_OK)
return status;
parent_gpu->sli_enabled = (gpu_info->subdeviceCount > 1);
if (!fb_info.bZeroFb)
parent_gpu->max_allocatable_address = fb_info.maxAllocatableAddress;
parent_gpu->virt_mode = gpu_info->virtMode;
if (parent_gpu->virt_mode == UVM_VIRT_MODE_LEGACY) {
UVM_ERR_PRINT("Failed to init GPU %s. UVM is not supported in legacy virtualization mode\n",
@@ -1419,6 +1433,14 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
uvm_mmu_init_gpu_chunk_sizes(parent_gpu);
status = uvm_pmm_devmem_init(parent_gpu);
if (status != NV_OK) {
UVM_ERR_PRINT("failed to intialize device private memory: %s, GPU %s\n",
nvstatusToString(status),
uvm_parent_gpu_name(parent_gpu));
return status;
}
status = uvm_ats_add_gpu(parent_gpu);
if (status != NV_OK) {
UVM_ERR_PRINT("uvm_ats_add_gpu failed: %s, GPU %s\n",
@@ -1667,6 +1689,7 @@ static void deinit_parent_gpu(uvm_parent_gpu_t *parent_gpu)
deinit_parent_procfs_files(parent_gpu);
uvm_pmm_devmem_deinit(parent_gpu);
uvm_ats_remove_gpu(parent_gpu);
UVM_ASSERT(atomic64_read(&parent_gpu->mapped_cpu_pages_size) == 0);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -696,6 +696,11 @@ struct uvm_gpu_struct
// ZeroFB testing mode, this will be 0.
NvU64 size;
// Physical start of the heap. For SMC-enabled GPUs this is useful to
// partition PMM; it is used by HMM to figure out the right translation
// between HMM ranges and PMM offsets.
NvU64 phys_start;
// Max (inclusive) physical address of this GPU's memory that the driver
// can allocate through PMM (PMA).
NvU64 max_allocatable_address;
@@ -1015,6 +1020,13 @@ struct uvm_parent_gpu_struct
// Do not read this field directly, use uvm_gpu_device_handle instead.
uvmGpuDeviceHandle rm_device;
// Total amount of physical memory available on the parent GPU.
NvU64 max_allocatable_address;
#if UVM_IS_CONFIG_HMM()
uvm_pmm_gpu_devmem_t *devmem;
#endif
// The physical address range addressable by the GPU
//
// The GPU has its NV_PFB_XV_UPPER_ADDR register set by RM to
@@ -1288,6 +1300,10 @@ struct uvm_parent_gpu_struct
// 47-bit fabric memory physical offset that peer gpus need to access
// to read a peer's memory
NvU64 fabric_memory_window_start;
// 47-bit fabric memory physical offset that peer gpus need to access
// to read remote EGM memory.
NvU64 egm_fabric_memory_window_start;
} nvswitch_info;
struct
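A tiny arithmetic sketch of the fabric window layout behind these fields (the window start and the framebuffer offset below are made up): with the upper 13 of 47 address bits carrying routing information, each peer is left a 2^34-byte (16 GB) window, and a peer's memory is addressed at its fabric_memory_window_start plus the local framebuffer offset.

#include <stdint.h>
#include <stdio.h>

// Upper 13 bits of the 47-bit fabric address select the peer, leaving
// 47 - 13 = 34 bits (16 GB) of framebuffer window per peer.
#define FABRIC_WINDOW_BITS (47 - 13)

int main(void)
{
    uint64_t window_size = 1ull << FABRIC_WINDOW_BITS;

    // Hypothetical window start for one peer (a multiple of the window size)
    // and an offset into that peer's framebuffer.
    uint64_t fabric_memory_window_start = 5ull << FABRIC_WINDOW_BITS;
    uint64_t fb_offset = 32ull << 20;  // 32 MB

    printf("window size: %llu GB\n", (unsigned long long)(window_size >> 30));
    printf("peer physical address: 0x%llx\n",
           (unsigned long long)(fabric_memory_window_start + fb_offset));
    return 0;
}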

View File

@@ -321,13 +321,17 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
{
uvm_range_tree_node_t *node;
uvm_va_block_t *va_block;
struct range range = gpu->pmm.devmem.pagemap.range;
unsigned long devmem_start;
unsigned long devmem_end;
unsigned long pfn;
bool retry;
if (!uvm_hmm_is_enabled(va_space))
return;
devmem_start = gpu->parent->devmem->pagemap.range.start + gpu->mem_info.phys_start;
devmem_end = devmem_start + gpu->mem_info.size;
if (mm)
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked_write(&va_space->lock);
@@ -341,7 +345,7 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
do {
retry = false;
for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
for (pfn = __phys_to_pfn(devmem_start); pfn <= __phys_to_pfn(devmem_end); pfn++) {
struct page *page = pfn_to_page(pfn);
UVM_ASSERT(is_device_private_page(page));
@@ -349,7 +353,7 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
// This check is racy because nothing stops the page being freed and
// even reused. That doesn't matter though - worst case the
// migration fails, we retry and find the va_space doesn't match.
if (page->zone_device_data == va_space)
if (uvm_pmm_devmem_page_to_va_space(page) == va_space)
if (uvm_hmm_pmm_gpu_evict_pfn(pfn) != NV_OK)
retry = true;
}
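A userspace sketch of how the PFN window scanned above is now derived per GPU partition (PAGE_SHIFT and all addresses are assumptions): the device private pages for this GPU start at the parent's pagemap range start plus the GPU's vidmem phys_start, and span mem_info.size bytes.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12  // assumption: 4K pages

int main(void)
{
    // Hypothetical stand-ins for gpu->parent->devmem->pagemap.range.start
    // and the (possibly SMC-partitioned) GPU's mem_info fields.
    uint64_t pagemap_range_start = 0x80000000000ull;
    uint64_t phys_start          = 0;            // partition offset into vidmem
    uint64_t size                = 4ull << 30;   // 4 GB of vidmem

    uint64_t devmem_start = pagemap_range_start + phys_start;
    uint64_t devmem_end   = devmem_start + size;

    // Same PFN walk as uvm_hmm_unregister_gpu(), minus the per-page eviction.
    printf("scanning pfns 0x%llx..0x%llx\n",
           (unsigned long long)(devmem_start >> PAGE_SHIFT),
           (unsigned long long)(devmem_end >> PAGE_SHIFT));
    return 0;
}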
@@ -1713,7 +1717,7 @@ static void gpu_chunk_remove(uvm_va_block_t *va_block,
uvm_gpu_chunk_t *gpu_chunk;
uvm_gpu_id_t id;
id = uvm_pmm_devmem_page_to_gpu_id(page);
id = uvm_gpu_chunk_get_gpu(uvm_pmm_devmem_page_to_chunk(page))->id;
gpu_state = uvm_va_block_gpu_state_get(va_block, id);
UVM_ASSERT(gpu_state);
@@ -1743,7 +1747,7 @@ static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
uvm_gpu_id_t id;
NV_STATUS status;
id = uvm_pmm_devmem_page_to_gpu_id(page);
id = uvm_gpu_chunk_get_gpu(uvm_pmm_devmem_page_to_chunk(page))->id;
gpu_state = uvm_va_block_gpu_state_get(va_block, id);
// It's possible that this is a fresh va_block we're trying to add an
@@ -1765,7 +1769,7 @@ static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
gpu_chunk = uvm_pmm_devmem_page_to_chunk(page);
UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
UVM_ASSERT(gpu_chunk->is_referenced);
UVM_ASSERT(page->zone_device_data == va_block->hmm.va_space);
UVM_ASSERT(uvm_pmm_devmem_page_to_va_space(page) == va_block->hmm.va_space);
if (gpu_state->chunks[page_index] == gpu_chunk)
return NV_OK;
@@ -1992,7 +1996,7 @@ static void fill_dst_pfn(uvm_va_block_t *va_block,
hmm_mark_gpu_chunk_referenced(va_block, gpu, gpu_chunk);
UVM_ASSERT(!page_count(dpage));
zone_device_page_init(dpage);
dpage->zone_device_data = va_block->hmm.va_space;
dpage->zone_device_data = gpu_chunk;
dst_pfns[page_index] = migrate_pfn(pfn);
}
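With this change a device private page's zone_device_data holds its uvm_gpu_chunk_t rather than the va_space, and the va_space is recovered through the chunk's va_block. A hedged sketch of that lookup chain, with the driver structures reduced to just the fields involved (not the real definitions):

#include <stdio.h>

// Simplified stand-ins for the driver types; only the fields used here.
struct va_space { int id; };
struct va_block { struct { struct va_space *va_space; } hmm; };
struct gpu_chunk { struct va_block *va_block; };
struct page { void *zone_device_data; };

// Mirrors uvm_pmm_devmem_page_to_chunk(): the chunk is stored directly.
static struct gpu_chunk *page_to_chunk(struct page *p)
{
    return p->zone_device_data;
}

// Mirrors uvm_pmm_devmem_page_to_va_space(): may return NULL during the racy
// check in uvm_hmm_unregister_gpu(), when the page is being freed or reused.
static struct va_space *page_to_va_space(struct page *p)
{
    struct gpu_chunk *chunk = page_to_chunk(p);
    if (!chunk || !chunk->va_block)
        return NULL;
    return chunk->va_block->hmm.va_space;
}

int main(void)
{
    struct va_space vas = { .id = 1 };
    struct va_block blk = { .hmm = { .va_space = &vas } };
    struct gpu_chunk chunk = { .va_block = &blk };
    struct page dpage = { .zone_device_data = &chunk };
    printf("va_space id %d\n", page_to_va_space(&dpage)->id);
    return 0;
}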

View File

@@ -130,27 +130,12 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
NV_STATUS status = NV_OK;
NV_STATUS tracker_status;
uvm_prot_t prot = UVM_PROT_READ_WRITE_ATOMIC;
// Get the mask of unmapped pages because it will change after the
// first map operation
uvm_va_block_unmapped_pages_get(va_block, region, &va_block_context->caller_page_mask);
if (uvm_va_block_is_hmm(va_block) && !UVM_ID_IS_CPU(dest_id)) {
// Do not map pages that are already resident on the CPU. This is in
// order to avoid breaking system-wide atomic operations on HMM. HMM's
// implementation of system-side atomic operations involves restricting
// mappings to one processor (CPU or a GPU) at a time. If we were to
// grant a GPU a mapping to system memory, this gets into trouble
// because, on the CPU side, Linux can silently upgrade PTE permissions
// (move from read-only, to read-write, without any MMU notifiers
// firing), thus breaking the model by allowing simultaneous read-write
// access from two separate processors. To avoid that, just don't map
// such pages at all, when migrating.
uvm_page_mask_andnot(&va_block_context->caller_page_mask,
&va_block_context->caller_page_mask,
uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE));
}
// Only map those pages that are not mapped anywhere else (likely due
// to a first touch or a migration). We pass
// UvmEventMapRemoteCauseInvalid since the destination processor of a
@@ -166,6 +151,31 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
if (status != NV_OK)
goto out;
if (uvm_va_block_is_hmm(va_block) && UVM_ID_IS_CPU(dest_id)) {
uvm_processor_id_t id;
// Do not atomically map pages that are resident on the CPU. This is in
// order to avoid breaking system-wide atomic operations on HMM. HMM's
// implementation of system-side atomic operations involves restricting
// mappings to one processor (CPU or a GPU) at a time. If we were to
// grant a GPU a mapping to system memory, this gets into trouble
// because, on the CPU side, Linux can silently upgrade PTE permissions
// (move from read-only, to read-write, without any MMU notifiers
// firing), thus breaking the model by allowing simultaneous read-write
// access from two separate processors. To avoid that, don't remote map
// such pages atomically, after migrating.
// Also note that HMM sets CPU mapping for resident pages so the mask
// of pages to be mapped needs to be recomputed without including the
// CPU mapping.
prot = UVM_PROT_READ_WRITE;
uvm_page_mask_region_fill(&va_block_context->caller_page_mask, region);
for_each_gpu_id_in_mask(id, &va_block->mapped) {
uvm_page_mask_andnot(&va_block_context->caller_page_mask,
&va_block_context->caller_page_mask,
uvm_va_block_map_mask_get(va_block, id));
}
}
// Add mappings for AccessedBy processors
//
// No mappings within this call will operate on dest_id, so we don't
@@ -176,7 +186,7 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
dest_id,
region,
&va_block_context->caller_page_mask,
UVM_PROT_READ_WRITE_ATOMIC,
prot,
NULL);
out:
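A minimal bitmask model of the recomputation above (page masks reduced to an 8-bit word; the masks are made up): when migrating an HMM block to the CPU, start from every page in the region, clear any page some GPU still maps, and map the remainder at READ_WRITE rather than READ_WRITE_ATOMIC.

#include <stdint.h>
#include <stdio.h>

#define NUM_GPUS 2

int main(void)
{
    // Hypothetical 8-page region; bit i set means page i is in the mask.
    uint8_t region_mask = 0xff;
    // Hypothetical per-GPU map masks: GPU0 maps pages 0-1, GPU1 maps page 7.
    uint8_t gpu_map_mask[NUM_GPUS] = { 0x03, 0x80 };

    // Mirrors the loop over va_block->mapped: drop pages any GPU still maps.
    uint8_t caller_page_mask = region_mask;
    for (int i = 0; i < NUM_GPUS; i++)
        caller_page_mask &= (uint8_t)~gpu_map_mask[i];

    // These pages get CPU-side mappings at RW, not RW-atomic, so a later
    // silent CPU PTE upgrade cannot break the single-writer atomic model.
    printf("map at READ_WRITE: 0x%02x\n", caller_page_mask);  // prints 0x7c
    return 0;
}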

View File

@@ -1409,11 +1409,13 @@ static bool thrashing_processors_have_fast_access_to(uvm_va_space_t *va_space,
uvm_processor_mask_set(fast_to, to);
}
else {
// Include registered SMC peers and the processor 'to'.
// Include all SMC peers and the processor 'to'.
// This includes SMC peers that are not registered.
// Since not-registered peers cannot be in page_thrashing->processors,
// the value of their respective bits in "fast_to" doesn't matter.
uvm_processor_mask_range_fill(fast_to,
uvm_gpu_id_from_sub_processor(uvm_parent_gpu_id_from_gpu_id(to), 0),
UVM_PARENT_ID_MAX_SUB_PROCESSORS);
uvm_processor_mask_and(fast_to, fast_to, &va_space->registered_gpu_va_spaces);
}
return uvm_processor_mask_subset(&page_thrashing->processors, fast_to);
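A small bitmask model of the widened check (processor masks reduced to a 64-bit word; IDs and the sub-processor count are made up): fast_to is filled for every sub-processor slot of to's parent GPU, including SMC peers that never registered a GPU VA space; since page_thrashing->processors can only contain registered processors, those extra bits can never flip the subset test.

#include <stdint.h>
#include <stdio.h>

#define MAX_SUB_PROCESSORS 8  // assumption, stands in for UVM_PARENT_ID_MAX_SUB_PROCESSORS

// Set 'count' consecutive bits starting at 'first', like uvm_processor_mask_range_fill().
static uint64_t mask_range_fill(unsigned first, unsigned count)
{
    return ((1ull << count) - 1) << first;
}

int main(void)
{
    // Hypothetical layout: the parent GPU of 'to' owns processor IDs 8..15.
    uint64_t fast_to = mask_range_fill(8, MAX_SUB_PROCESSORS);

    // Thrashing processors are always registered ones, e.g. IDs 8 and 9 here,
    // so the bits for unregistered SMC peers (10..15) never affect the result.
    uint64_t thrashing = (1ull << 8) | (1ull << 9);

    printf("subset: %d\n", (thrashing & ~fast_to) == 0);  // prints 1
    return 0;
}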

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2024 NVIDIA Corporation
Copyright (c) 2015-2025 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -3030,69 +3030,23 @@ NvU32 uvm_pmm_gpu_phys_to_virt(uvm_pmm_gpu_t *pmm, NvU64 phys_addr, NvU64 region
#if UVM_IS_CONFIG_HMM()
static uvm_pmm_gpu_t *devmem_page_to_pmm(struct page *page)
{
return container_of(page->pgmap, uvm_pmm_gpu_t, devmem.pagemap);
}
static uvm_gpu_chunk_t *devmem_page_to_chunk_locked(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
NvU64 chunk_addr = ((NvU64)page_to_pfn(page) << PAGE_SHIFT) - pmm->devmem.pagemap.range.start;
size_t index = chunk_addr / UVM_CHUNK_SIZE_MAX;
uvm_gpu_chunk_t *root_chunk;
uvm_gpu_chunk_t *chunk;
uvm_gpu_chunk_t *parent;
uvm_chunk_size_t chunk_size;
UVM_ASSERT(index < pmm->root_chunks.count);
root_chunk = &pmm->root_chunks.array[index].chunk;
UVM_ASSERT(root_chunk->address == UVM_ALIGN_DOWN(chunk_addr, UVM_CHUNK_SIZE_MAX));
// Find the uvm_gpu_chunk_t that corresponds to the device private struct
// page's PFN. The loop is only 0, 1, or 2 iterations.
for (chunk = root_chunk;
uvm_gpu_chunk_get_size(chunk) != page_size(page);
chunk = parent->suballoc->subchunks[index]) {
parent = chunk;
UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
UVM_ASSERT(parent->suballoc);
chunk_size = uvm_gpu_chunk_get_size(parent->suballoc->subchunks[0]);
index = (size_t)uvm_div_pow2_64(chunk_addr - parent->address, chunk_size);
UVM_ASSERT(index < num_subchunks(parent));
}
UVM_ASSERT(chunk->address = chunk_addr);
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
UVM_ASSERT(chunk->is_referenced);
return chunk;
}
uvm_gpu_chunk_t *uvm_pmm_devmem_page_to_chunk(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
uvm_gpu_chunk_t *chunk;
UVM_ASSERT(is_device_private_page(page));
uvm_spin_lock(&pmm->list_lock);
chunk = devmem_page_to_chunk_locked(page);
uvm_spin_unlock(&pmm->list_lock);
return chunk;
return page->zone_device_data;
}
uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page)
uvm_va_space_t *uvm_pmm_devmem_page_to_va_space(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
uvm_gpu_chunk_t *gpu_chunk = uvm_pmm_devmem_page_to_chunk(page);
UVM_ASSERT(is_device_private_page(page));
// uvm_hmm_unregister_gpu() needs to do a racy check here so
// page->zone_device_data might be NULL.
if (!gpu_chunk || !gpu_chunk->va_block)
return NULL;
return gpu->id;
return gpu_chunk->va_block->hmm.va_space;
}
// Check there are no orphan pages. This should be only called as part of
@@ -3104,12 +3058,17 @@ static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
{
size_t i;
bool ret = true;
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
unsigned long devmem_start;
unsigned long devmem_end;
unsigned long pfn;
struct range range = pmm->devmem.pagemap.range;
if (!pmm->initialized || !uvm_hmm_is_enabled_system_wide())
return ret;
devmem_start = gpu->parent->devmem->pagemap.range.start + gpu->mem_info.phys_start;
devmem_end = devmem_start + gpu->mem_info.size;
// Scan all the root chunks looking for subchunks which are still
// referenced.
for (i = 0; i < pmm->root_chunks.count; i++) {
@@ -3121,7 +3080,7 @@ static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
root_chunk_unlock(pmm, root_chunk);
}
for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
for (pfn = __phys_to_pfn(devmem_start); pfn <= __phys_to_pfn(devmem_end); pfn++) {
struct page *page = pfn_to_page(pfn);
if (!is_device_private_page(page)) {
@@ -3140,9 +3099,8 @@ static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
static void devmem_page_free(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
uvm_gpu_chunk_t *chunk;
uvm_gpu_chunk_t *chunk = uvm_pmm_devmem_page_to_chunk(page);
uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(chunk);
page->zone_device_data = NULL;
@@ -3150,23 +3108,22 @@ static void devmem_page_free(struct page *page)
// we may be in an interrupt context where we can't do that. Instead,
// do a lazy free. Note that we have to use a "normal" spin lock because
// the UVM context is not available.
spin_lock(&pmm->list_lock.lock);
spin_lock(&gpu->pmm.list_lock.lock);
chunk = devmem_page_to_chunk_locked(page);
UVM_ASSERT(chunk->is_referenced);
chunk->is_referenced = false;
list_add_tail(&chunk->list, &pmm->root_chunks.va_block_lazy_free);
list_add_tail(&chunk->list, &gpu->pmm.root_chunks.va_block_lazy_free);
spin_unlock(&pmm->list_lock.lock);
spin_unlock(&gpu->pmm.list_lock.lock);
nv_kthread_q_schedule_q_item(&gpu->parent->lazy_free_q,
&pmm->root_chunks.va_block_lazy_free_q_item);
&gpu->pmm.root_chunks.va_block_lazy_free_q_item);
}
// This is called by HMM when the CPU faults on a ZONE_DEVICE private entry.
static vm_fault_t devmem_fault(struct vm_fault *vmf)
{
uvm_va_space_t *va_space = vmf->page->zone_device_data;
uvm_va_space_t *va_space = uvm_pmm_devmem_page_to_va_space(vmf->page);
if (!va_space)
return VM_FAULT_SIGBUS;
@@ -3185,26 +3142,46 @@ static const struct dev_pagemap_ops uvm_pmm_devmem_ops =
.migrate_to_ram = devmem_fault_entry,
};
static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
// Allocating and initialising device private pages takes a significant amount
// of time on very large systems. So rather than doing that every time a GPU is
// registered, we do it once and keep track of the range when the GPU is
// unregistered for later reuse.
//
// This function tries to find an existing range of device private pages and,
// if one is available, removes it from the list and returns it for reuse.
static uvm_pmm_gpu_devmem_t *devmem_reuse_pagemap(unsigned long size)
{
unsigned long size = pmm->root_chunks.count * UVM_CHUNK_SIZE_MAX;
uvm_pmm_gpu_devmem_t *devmem = &pmm->devmem;
uvm_pmm_gpu_devmem_t *devmem;
list_for_each_entry(devmem, &g_uvm_global.devmem_ranges.list, list_node) {
if (devmem->size == size) {
list_del(&devmem->list_node);
return devmem;
}
}
return NULL;
}
static uvm_pmm_gpu_devmem_t *devmem_alloc_pagemap(unsigned long size)
{
uvm_pmm_gpu_devmem_t *devmem;
struct resource *res;
void *ptr;
NV_STATUS status;
if (!uvm_hmm_is_enabled_system_wide()) {
devmem->pagemap.owner = NULL;
return NV_OK;
}
res = request_free_mem_region(&iomem_resource, size, "nvidia-uvm-hmm");
if (IS_ERR(res)) {
UVM_ERR_PRINT("request_free_mem_region() err %ld\n", PTR_ERR(res));
status = errno_to_nv_status(PTR_ERR(res));
goto err;
return NULL;
}
devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
if (!devmem)
goto err;
devmem->size = size;
devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
devmem->pagemap.range.start = res->start;
devmem->pagemap.range.end = res->end;
@@ -3217,43 +3194,77 @@ static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
if (IS_ERR(ptr)) {
UVM_ERR_PRINT("memremap_pages() err %ld\n", PTR_ERR(ptr));
status = errno_to_nv_status(PTR_ERR(ptr));
goto err_release;
goto err_free;
}
return NV_OK;
return devmem;
err_free:
kfree(devmem);
err_release:
release_mem_region(res->start, resource_size(res));
err:
devmem->pagemap.owner = NULL;
return status;
release_mem_region(res->start, resource_size(res));
return NULL;
}
static void devmem_deinit(uvm_pmm_gpu_t *pmm)
NV_STATUS uvm_pmm_devmem_init(uvm_parent_gpu_t *gpu)
{
uvm_pmm_gpu_devmem_t *devmem = &pmm->devmem;
// Create a DEVICE_PRIVATE page for every GPU page available on the parent.
unsigned long size = gpu->max_allocatable_address;
if (!devmem->pagemap.owner)
if (!uvm_hmm_is_enabled_system_wide()) {
gpu->devmem = NULL;
return NV_OK;
}
gpu->devmem = devmem_reuse_pagemap(size);
if (!gpu->devmem)
gpu->devmem = devmem_alloc_pagemap(size);
if (!gpu->devmem)
return NV_ERR_NO_MEMORY;
return NV_OK;
}
void uvm_pmm_devmem_deinit(uvm_parent_gpu_t *gpu)
{
if (!gpu->devmem)
return;
memunmap_pages(&devmem->pagemap);
release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
list_add_tail(&gpu->devmem->list_node, &g_uvm_global.devmem_ranges.list);
gpu->devmem = NULL;
}
void uvm_pmm_devmem_exit(void)
{
uvm_pmm_gpu_devmem_t *devmem, *devmem_next;
list_for_each_entry_safe(devmem, devmem_next, &g_uvm_global.devmem_ranges.list, list_node) {
list_del(&devmem->list_node);
memunmap_pages(&devmem->pagemap);
release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
kfree(devmem);
}
}
unsigned long uvm_pmm_gpu_devmem_get_pfn(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
return (pmm->devmem.pagemap.range.start + chunk->address) >> PAGE_SHIFT;
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
unsigned long devmem_start = gpu->parent->devmem->pagemap.range.start;
return (devmem_start + chunk->address) >> PAGE_SHIFT;
}
#endif // UVM_IS_CONFIG_HMM()
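A userspace sketch of the PFN translation in uvm_pmm_gpu_devmem_get_pfn() (PAGE_SHIFT and the range start are assumptions): because the parent now backs its whole allocatable range with device private pages, a chunk's PFN is just its vidmem address offset into the pagemap range.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12  // assumption: 4K pages

// Mirrors uvm_pmm_gpu_devmem_get_pfn(): the range start comes from the parent
// GPU's pagemap, chunk_address is the chunk's vidmem physical address.
static uint64_t chunk_to_devmem_pfn(uint64_t devmem_range_start, uint64_t chunk_address)
{
    return (devmem_range_start + chunk_address) >> PAGE_SHIFT;
}

int main(void)
{
    uint64_t range_start = 0x80000000000ull;  // hypothetical pagemap start
    uint64_t chunk_addr  = 2ull << 20;        // chunk at vidmem offset 2 MB
    printf("pfn 0x%llx\n",
           (unsigned long long)chunk_to_devmem_pfn(range_start, chunk_addr));
    return 0;
}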
#if !UVM_IS_CONFIG_HMM()
static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
NV_STATUS uvm_pmm_devmem_init(uvm_parent_gpu_t *gpu)
{
return NV_OK;
}
static void devmem_deinit(uvm_pmm_gpu_t *pmm)
void uvm_pmm_devmem_deinit(uvm_parent_gpu_t *gpu)
{
}
@@ -3469,10 +3480,6 @@ NV_STATUS uvm_pmm_gpu_init(uvm_pmm_gpu_t *pmm)
}
}
status = devmem_init(pmm);
if (status != NV_OK)
goto cleanup;
return NV_OK;
cleanup:
uvm_pmm_gpu_deinit(pmm);
@@ -3543,8 +3550,6 @@ void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm)
deinit_caches(pmm);
devmem_deinit(pmm);
pmm->initialized = false;
}
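A userspace model of the reuse scheme above (a plain singly linked list and calloc stand in for the kernel list, request_free_mem_region() and memremap_pages(); devmem_get/devmem_put are hypothetical names): ranges released at GPU teardown are cached on a global list keyed by requested size, so the expensive struct page initialisation happens at most once per size.

#include <stdio.h>
#include <stdlib.h>

struct devmem_range {
    struct devmem_range *next;
    unsigned long size;  // requested size, used as the search key
};

static struct devmem_range *g_free_list;

static struct devmem_range *devmem_get(unsigned long size)
{
    // First try to reuse a cached range with the same requested size.
    for (struct devmem_range **p = &g_free_list; *p; p = &(*p)->next) {
        if ((*p)->size == size) {
            struct devmem_range *r = *p;
            *p = r->next;
            return r;
        }
    }

    // Otherwise create a new one (stands in for devmem_alloc_pagemap()).
    struct devmem_range *r = calloc(1, sizeof(*r));
    if (r)
        r->size = size;
    return r;
}

static void devmem_put(struct devmem_range *r)
{
    // GPU removal: keep the range around instead of tearing it down.
    r->next = g_free_list;
    g_free_list = r;
}

int main(void)
{
    struct devmem_range *a = devmem_get(1ul << 30);
    devmem_put(a);
    struct devmem_range *b = devmem_get(1ul << 30);  // reuses 'a'
    printf("reused: %d\n", a == b);                  // prints 1
    free(b);
    return 0;
}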

View File

@@ -192,22 +192,41 @@ typedef struct uvm_pmm_gpu_chunk_suballoc_struct uvm_pmm_gpu_chunk_suballoc_t;
#if UVM_IS_CONFIG_HMM()
typedef struct uvm_pmm_gpu_struct uvm_pmm_gpu_t;
typedef struct
{
// For g_uvm_global.devmem_ranges
struct list_head list_node;
// Size that was requested when creating this region. This may be less than
// the size actually allocated by the kernel due to alignment constraints.
// Figuring out the required alignment at compile time is difficult due to
// unexported macros, so just use the requested size as the search key.
unsigned long size;
struct dev_pagemap pagemap;
} uvm_pmm_gpu_devmem_t;
typedef struct uvm_pmm_gpu_struct uvm_pmm_gpu_t;
// Return the GPU chunk for a given device private struct page.
uvm_gpu_chunk_t *uvm_pmm_devmem_page_to_chunk(struct page *page);
// Return the va_space for a given device private struct page.
uvm_va_space_t *uvm_pmm_devmem_page_to_va_space(struct page *page);
// Return the GPU id for a given device private struct page.
uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page);
// Return the PFN of the device private struct page for the given GPU chunk.
unsigned long uvm_pmm_gpu_devmem_get_pfn(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
// Free unused ZONE_DEVICE pages.
void uvm_pmm_devmem_exit(void);
#else
static inline void uvm_pmm_devmem_exit(void)
{
}
#endif
#if defined(CONFIG_PCI_P2PDMA) && defined(NV_STRUCT_PAGE_HAS_ZONE_DEVICE_DATA)
@@ -349,10 +368,6 @@ typedef struct uvm_pmm_gpu_struct
nv_kthread_q_item_t va_block_lazy_free_q_item;
} root_chunks;
#if UVM_IS_CONFIG_HMM()
uvm_pmm_gpu_devmem_t devmem;
#endif
// Lock protecting PMA allocation, freeing and eviction
uvm_rw_semaphore_t pma_lock;
@@ -604,6 +619,10 @@ static uvm_chunk_size_t uvm_chunk_find_prev_size(uvm_chunk_sizes_mask_t chunk_si
// retained, and it's up to the caller to release them.
NvU32 uvm_pmm_gpu_phys_to_virt(uvm_pmm_gpu_t *pmm, NvU64 phys_addr, NvU64 region_size, uvm_reverse_map_t *out_mappings);
// Allocate and initialise struct page data in the kernel to support HMM.
NV_STATUS uvm_pmm_devmem_init(uvm_parent_gpu_t *gpu);
void uvm_pmm_devmem_deinit(uvm_parent_gpu_t *parent_gpu);
// Iterates over every size in the input mask from smallest to largest
#define for_each_chunk_size(__size, __chunk_sizes) \
for ((__size) = (__chunk_sizes) ? uvm_chunk_find_first_size(__chunk_sizes) : \

View File

@@ -2839,10 +2839,14 @@ static bool block_check_egm_peer(uvm_va_space_t *va_space, uvm_gpu_t *gpu, int n
remote_node_info = uvm_va_space_get_egm_numa_node_info(va_space, nid);
UVM_ASSERT(!uvm_parent_processor_mask_empty(&remote_node_info->parent_gpus));
for_each_parent_gpu_in_mask(parent_gpu, &remote_node_info->parent_gpus) {
UVM_ASSERT(parent_gpu->egm.enabled);
NvU64 page_addr = phys_addr.address;
if (phys_addr.address + parent_gpu->egm.base_address >= remote_node_info->node_start &&
phys_addr.address + parent_gpu->egm.base_address < remote_node_info->node_end &&
UVM_ASSERT(parent_gpu->egm.enabled);
page_addr += parent_gpu->egm.base_address;
if (parent_gpu->nvswitch_info.is_nvswitch_connected && gpu->parent != parent_gpu)
page_addr -= parent_gpu->nvswitch_info.egm_fabric_memory_window_start;
if (page_addr >= remote_node_info->node_start && page_addr < remote_node_info->node_end &&
remote_node_info->routing_table[uvm_parent_id_gpu_index(gpu->parent->id)] == parent_gpu) {
return true;
}
@@ -3229,8 +3233,15 @@ static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
if (routing_gpu) {
struct page *page = uvm_cpu_chunk_get_cpu_page(block, chunk, block_page.page_index);
phys_addr = page_to_phys(page);
aperture = uvm_gpu_egm_peer_aperture(gpu->parent, routing_gpu);
// Remote EGM routing is based on both the EGM base address and EGM
// fabric memory window.
if (routing_gpu->nvswitch_info.is_nvswitch_connected && routing_gpu != gpu->parent)
phys_addr += routing_gpu->nvswitch_info.egm_fabric_memory_window_start;
uvm_page_mask_set(&accessing_gpu_state->egm_pages, block_page.page_index);
return uvm_gpu_phys_address(aperture, phys_addr - routing_gpu->egm.base_address);
}
@@ -13575,6 +13586,9 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
struct page *page = block_page_get(block, block_page);
phys_addr = page_to_phys(page) - egm_routing_gpu->egm.base_address;
if (egm_routing_gpu->nvswitch_info.is_nvswitch_connected && egm_routing_gpu != gpu->parent)
phys_addr += egm_routing_gpu->nvswitch_info.egm_fabric_memory_window_start;
params->is_egm_mapping[count] = true;
}
}
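A sketch of the EGM address adjustment above (userspace arithmetic; the base, window and node addresses are all made up): a remote EGM physical address is offset by the routing GPU's EGM base, and when the routing GPU is reached over NVSwitch the peer must also add that GPU's egm_fabric_memory_window_start, so the range check in block_check_egm_peer() undoes both steps before comparing against the NUMA node's [node_start, node_end) window. The helper names below are illustrative, not the driver's.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Hypothetical values; the real ones come from RM and the routing parent GPU.
#define EGM_BASE_ADDRESS        0x100000000000ull
#define EGM_FABRIC_WINDOW_START 0x020000000000ull

// Mirrors the composition in block_phys_page_address(): GPU-visible EGM
// address for a CPU physical page routed through a (possibly remote) parent.
static uint64_t egm_peer_address(uint64_t cpu_phys, bool nvswitch_remote)
{
    uint64_t addr = cpu_phys;
    if (nvswitch_remote)
        addr += EGM_FABRIC_WINDOW_START;
    return addr - EGM_BASE_ADDRESS;
}

// Mirrors block_check_egm_peer(): undo the composition before comparing
// against the NUMA node's [node_start, node_end) physical range.
static bool egm_addr_in_node(uint64_t egm_addr, bool nvswitch_remote,
                             uint64_t node_start, uint64_t node_end)
{
    uint64_t page_addr = egm_addr + EGM_BASE_ADDRESS;
    if (nvswitch_remote)
        page_addr -= EGM_FABRIC_WINDOW_START;
    return page_addr >= node_start && page_addr < node_end;
}

int main(void)
{
    uint64_t node_start = 0x100000000000ull, node_end = 0x100080000000ull;
    uint64_t cpu_phys = 0x100040000000ull;  // a page inside the node above

    uint64_t egm = egm_peer_address(cpu_phys, true);
    printf("egm addr 0x%llx, back in node: %d\n",
           (unsigned long long)egm,
           egm_addr_in_node(egm, true, node_start, node_end));  // prints 1
    return 0;
}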