mirror of
https://github.com/NVIDIA/open-gpu-kernel-modules.git
synced 2026-03-03 12:20:11 +00:00
525.53
This commit is contained in:
@@ -30,31 +30,6 @@
|
||||
#include "uvm_va_range.h"
|
||||
#include "uvm_test.h"
|
||||
|
||||
// Global cache to allocate the per-VA block prefetch detection structures
|
||||
static struct kmem_cache *g_prefetch_info_cache __read_mostly;
|
||||
|
||||
// Per-VA block prefetch detection structure
|
||||
typedef struct
|
||||
{
|
||||
uvm_page_mask_t prefetch_pages;
|
||||
|
||||
uvm_page_mask_t migrate_pages;
|
||||
|
||||
uvm_va_block_bitmap_tree_t bitmap_tree;
|
||||
|
||||
uvm_processor_id_t last_migration_proc_id;
|
||||
|
||||
uvm_va_block_region_t region;
|
||||
|
||||
size_t big_page_size;
|
||||
|
||||
uvm_va_block_region_t big_pages_region;
|
||||
|
||||
NvU16 pending_prefetch_pages;
|
||||
|
||||
NvU16 fault_migrations_to_last_proc;
|
||||
} block_prefetch_info_t;
|
||||
|
||||
//
|
||||
// Tunables for prefetch detection/prevention (configurable via module parameters)
|
||||
//
|
||||
@@ -88,19 +63,54 @@ static bool g_uvm_perf_prefetch_enable;
|
||||
static unsigned g_uvm_perf_prefetch_threshold;
|
||||
static unsigned g_uvm_perf_prefetch_min_faults;
|
||||
|
||||
// Callback declaration for the performance heuristics events
|
||||
static void prefetch_block_destroy_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data);
|
||||
void uvm_perf_prefetch_bitmap_tree_iter_init(const uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
|
||||
uvm_page_index_t page_index,
|
||||
uvm_perf_prefetch_bitmap_tree_iter_t *iter)
|
||||
{
|
||||
UVM_ASSERT(bitmap_tree->level_count > 0);
|
||||
UVM_ASSERT_MSG(page_index < bitmap_tree->leaf_count,
|
||||
"%zd vs %zd",
|
||||
(size_t)page_index,
|
||||
(size_t)bitmap_tree->leaf_count);
|
||||
|
||||
static uvm_va_block_region_t compute_prefetch_region(uvm_page_index_t page_index, block_prefetch_info_t *prefetch_info)
|
||||
iter->level_idx = bitmap_tree->level_count - 1;
|
||||
iter->node_idx = page_index;
|
||||
}
|
||||
|
||||
uvm_va_block_region_t uvm_perf_prefetch_bitmap_tree_iter_get_range(const uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
|
||||
const uvm_perf_prefetch_bitmap_tree_iter_t *iter)
|
||||
{
|
||||
NvU16 range_leaves = uvm_perf_tree_iter_leaf_range(bitmap_tree, iter);
|
||||
NvU16 range_start = uvm_perf_tree_iter_leaf_range_start(bitmap_tree, iter);
|
||||
uvm_va_block_region_t subregion = uvm_va_block_region(range_start, range_start + range_leaves);
|
||||
|
||||
UVM_ASSERT(iter->level_idx >= 0);
|
||||
UVM_ASSERT(iter->level_idx < bitmap_tree->level_count);
|
||||
|
||||
return subregion;
|
||||
}
|
||||
|
||||
NvU16 uvm_perf_prefetch_bitmap_tree_iter_get_count(const uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
|
||||
const uvm_perf_prefetch_bitmap_tree_iter_t *iter)
|
||||
{
|
||||
uvm_va_block_region_t subregion = uvm_perf_prefetch_bitmap_tree_iter_get_range(bitmap_tree, iter);
|
||||
|
||||
return uvm_page_mask_region_weight(&bitmap_tree->pages, subregion);
|
||||
}
|
||||
|
||||
static uvm_va_block_region_t compute_prefetch_region(uvm_page_index_t page_index,
|
||||
uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
|
||||
uvm_va_block_region_t max_prefetch_region)
|
||||
{
|
||||
NvU16 counter;
|
||||
uvm_va_block_bitmap_tree_iter_t iter;
|
||||
uvm_va_block_bitmap_tree_t *bitmap_tree = &prefetch_info->bitmap_tree;
|
||||
uvm_va_block_region_t prefetch_region = uvm_va_block_region(bitmap_tree->leaf_count,
|
||||
bitmap_tree->leaf_count + 1);
|
||||
uvm_perf_prefetch_bitmap_tree_iter_t iter;
|
||||
uvm_va_block_region_t prefetch_region = uvm_va_block_region(0, 0);
|
||||
|
||||
uvm_va_block_bitmap_tree_traverse_counters(counter, bitmap_tree, page_index, &iter) {
|
||||
uvm_va_block_region_t subregion = uvm_va_block_bitmap_tree_iter_get_range(bitmap_tree, &iter);
|
||||
uvm_perf_prefetch_bitmap_tree_traverse_counters(counter,
|
||||
bitmap_tree,
|
||||
page_index - max_prefetch_region.first + bitmap_tree->offset,
|
||||
&iter) {
|
||||
uvm_va_block_region_t subregion = uvm_perf_prefetch_bitmap_tree_iter_get_range(bitmap_tree, &iter);
|
||||
NvU16 subregion_pages = uvm_va_block_region_num_pages(subregion);
|
||||
|
||||
UVM_ASSERT(counter <= subregion_pages);
|
||||
@@ -109,289 +119,287 @@ static uvm_va_block_region_t compute_prefetch_region(uvm_page_index_t page_index
|
||||
}
|
||||
|
||||
// Clamp prefetch region to actual pages
|
||||
if (prefetch_region.first < bitmap_tree->leaf_count) {
|
||||
if (prefetch_region.first < prefetch_info->region.first)
|
||||
prefetch_region.first = prefetch_info->region.first;
|
||||
if (prefetch_region.outer) {
|
||||
prefetch_region.first += max_prefetch_region.first;
|
||||
if (prefetch_region.first < bitmap_tree->offset) {
|
||||
prefetch_region.first = bitmap_tree->offset;
|
||||
}
|
||||
else {
|
||||
prefetch_region.first -= bitmap_tree->offset;
|
||||
if (prefetch_region.first < max_prefetch_region.first)
|
||||
prefetch_region.first = max_prefetch_region.first;
|
||||
}
|
||||
|
||||
if (prefetch_region.outer > prefetch_info->region.outer)
|
||||
prefetch_region.outer = prefetch_info->region.outer;
|
||||
prefetch_region.outer += max_prefetch_region.first;
|
||||
if (prefetch_region.outer < bitmap_tree->offset) {
|
||||
prefetch_region.outer = bitmap_tree->offset;
|
||||
}
|
||||
else {
|
||||
prefetch_region.outer -= bitmap_tree->offset;
|
||||
if (prefetch_region.outer > max_prefetch_region.outer)
|
||||
prefetch_region.outer = max_prefetch_region.outer;
|
||||
}
|
||||
}
|
||||
|
||||
return prefetch_region;
|
||||
}
|
||||
|
||||
// Performance heuristics module for prefetch
|
||||
static uvm_perf_module_t g_module_prefetch;
|
||||
|
||||
static uvm_perf_module_event_callback_desc_t g_callbacks_prefetch[] = {
|
||||
{ UVM_PERF_EVENT_BLOCK_DESTROY, prefetch_block_destroy_cb },
|
||||
{ UVM_PERF_EVENT_MODULE_UNLOAD, prefetch_block_destroy_cb },
|
||||
{ UVM_PERF_EVENT_BLOCK_SHRINK, prefetch_block_destroy_cb }
|
||||
};
|
||||
|
||||
// Get the prefetch detection struct for the given block
|
||||
static block_prefetch_info_t *prefetch_info_get(uvm_va_block_t *va_block)
|
||||
{
|
||||
return uvm_perf_module_type_data(va_block->perf_modules_data, UVM_PERF_MODULE_TYPE_PREFETCH);
|
||||
}
|
||||
|
||||
// Release the prefetch detection state attached to va_block, if there is any:
// the structure goes back to g_prefetch_info_cache and the block's
// UVM_PERF_MODULE_TYPE_PREFETCH data slot is unset. A block without state is
// left untouched.
static void prefetch_info_destroy(uvm_va_block_t *va_block)
{
    block_prefetch_info_t *info = prefetch_info_get(va_block);

    if (!info)
        return;

    kmem_cache_free(g_prefetch_info_cache, info);
    uvm_perf_module_type_unset_data(va_block->perf_modules_data, UVM_PERF_MODULE_TYPE_PREFETCH);
}
|
||||
|
||||
// Get the prefetch detection struct for the given block or create it if it
|
||||
// does not exist
|
||||
static block_prefetch_info_t *prefetch_info_get_create(uvm_va_block_t *va_block)
|
||||
{
|
||||
block_prefetch_info_t *prefetch_info = prefetch_info_get(va_block);
|
||||
if (!prefetch_info) {
|
||||
// Create some ghost leaves so we can align the tree to big page boundary. We use the
|
||||
// largest page size to handle the worst-case scenario
|
||||
size_t big_page_size = UVM_PAGE_SIZE_128K;
|
||||
uvm_va_block_region_t big_pages_region = uvm_va_block_big_page_region_all(va_block, big_page_size);
|
||||
size_t num_leaves = uvm_va_block_num_cpu_pages(va_block);
|
||||
|
||||
// If the va block is not big enough to fit 128KB pages, maybe it still can fit 64KB pages
|
||||
if (big_pages_region.outer == 0) {
|
||||
big_page_size = UVM_PAGE_SIZE_64K;
|
||||
big_pages_region = uvm_va_block_big_page_region_all(va_block, big_page_size);
|
||||
}
|
||||
|
||||
if (big_pages_region.first > 0)
|
||||
num_leaves += (big_page_size / PAGE_SIZE - big_pages_region.first);
|
||||
|
||||
UVM_ASSERT(num_leaves <= PAGES_PER_UVM_VA_BLOCK);
|
||||
|
||||
prefetch_info = nv_kmem_cache_zalloc(g_prefetch_info_cache, NV_UVM_GFP_FLAGS);
|
||||
if (!prefetch_info)
|
||||
goto fail;
|
||||
|
||||
prefetch_info->last_migration_proc_id = UVM_ID_INVALID;
|
||||
|
||||
uvm_va_block_bitmap_tree_init_from_page_count(&prefetch_info->bitmap_tree, num_leaves);
|
||||
|
||||
uvm_perf_module_type_set_data(va_block->perf_modules_data, prefetch_info, UVM_PERF_MODULE_TYPE_PREFETCH);
|
||||
}
|
||||
|
||||
return prefetch_info;
|
||||
|
||||
fail:
|
||||
prefetch_info_destroy(va_block);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void grow_fault_granularity_if_no_thrashing(block_prefetch_info_t *prefetch_info,
|
||||
static void grow_fault_granularity_if_no_thrashing(uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
|
||||
uvm_va_block_region_t region,
|
||||
uvm_page_index_t first,
|
||||
const uvm_page_mask_t *faulted_pages,
|
||||
const uvm_page_mask_t *thrashing_pages)
|
||||
{
|
||||
if (!uvm_page_mask_region_empty(faulted_pages, region) &&
|
||||
(!thrashing_pages || uvm_page_mask_region_empty(thrashing_pages, region))) {
|
||||
region.first += prefetch_info->region.first;
|
||||
region.outer += prefetch_info->region.first;
|
||||
uvm_page_mask_region_fill(&prefetch_info->bitmap_tree.pages, region);
|
||||
UVM_ASSERT(region.first >= first);
|
||||
region.first = region.first - first + bitmap_tree->offset;
|
||||
region.outer = region.outer - first + bitmap_tree->offset;
|
||||
UVM_ASSERT(region.outer <= bitmap_tree->leaf_count);
|
||||
uvm_page_mask_region_fill(&bitmap_tree->pages, region);
|
||||
}
|
||||
}
|
||||
|
||||
static void grow_fault_granularity(uvm_va_block_t *va_block,
|
||||
block_prefetch_info_t *prefetch_info,
|
||||
static void grow_fault_granularity(uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
|
||||
NvU32 big_page_size,
|
||||
uvm_va_block_region_t big_pages_region,
|
||||
uvm_va_block_region_t max_prefetch_region,
|
||||
const uvm_page_mask_t *faulted_pages,
|
||||
const uvm_page_mask_t *thrashing_pages)
|
||||
{
|
||||
size_t num_big_pages;
|
||||
size_t big_page_index;
|
||||
uvm_va_block_region_t block_region = uvm_va_block_region_from_block(va_block);
|
||||
uvm_page_index_t pages_per_big_page = big_page_size / PAGE_SIZE;
|
||||
uvm_page_index_t page_index;
|
||||
|
||||
// Migrate whole block if no big pages and no page in it is thrashing
|
||||
if (!big_pages_region.outer) {
|
||||
grow_fault_granularity_if_no_thrashing(bitmap_tree,
|
||||
max_prefetch_region,
|
||||
max_prefetch_region.first,
|
||||
faulted_pages,
|
||||
thrashing_pages);
|
||||
return;
|
||||
}
|
||||
|
||||
// Migrate whole "prefix" if no page in it is thrashing
|
||||
if (prefetch_info->big_pages_region.first > 0) {
|
||||
uvm_va_block_region_t prefix_region = uvm_va_block_region(0, prefetch_info->big_pages_region.first);
|
||||
if (big_pages_region.first > max_prefetch_region.first) {
|
||||
uvm_va_block_region_t prefix_region = uvm_va_block_region(max_prefetch_region.first, big_pages_region.first);
|
||||
|
||||
grow_fault_granularity_if_no_thrashing(prefetch_info, prefix_region, faulted_pages, thrashing_pages);
|
||||
grow_fault_granularity_if_no_thrashing(bitmap_tree,
|
||||
prefix_region,
|
||||
max_prefetch_region.first,
|
||||
faulted_pages,
|
||||
thrashing_pages);
|
||||
}
|
||||
|
||||
// Migrate whole big pages if they are not thrashing
|
||||
num_big_pages = uvm_va_block_num_big_pages(va_block, prefetch_info->big_page_size);
|
||||
for (big_page_index = 0; big_page_index < num_big_pages; ++big_page_index) {
|
||||
uvm_va_block_region_t big_region = uvm_va_block_big_page_region(va_block,
|
||||
big_page_index,
|
||||
prefetch_info->big_page_size);
|
||||
for (page_index = big_pages_region.first;
|
||||
page_index < big_pages_region.outer;
|
||||
page_index += pages_per_big_page) {
|
||||
uvm_va_block_region_t big_region = uvm_va_block_region(page_index,
|
||||
page_index + pages_per_big_page);
|
||||
|
||||
grow_fault_granularity_if_no_thrashing(prefetch_info, big_region, faulted_pages, thrashing_pages);
|
||||
grow_fault_granularity_if_no_thrashing(bitmap_tree,
|
||||
big_region,
|
||||
max_prefetch_region.first,
|
||||
faulted_pages,
|
||||
thrashing_pages);
|
||||
}
|
||||
|
||||
// Migrate whole "suffix" if no page in it is thrashing
|
||||
if (prefetch_info->big_pages_region.outer < block_region.outer) {
|
||||
uvm_va_block_region_t suffix_region = uvm_va_block_region(prefetch_info->big_pages_region.outer,
|
||||
block_region.outer);
|
||||
if (big_pages_region.outer < max_prefetch_region.outer) {
|
||||
uvm_va_block_region_t suffix_region = uvm_va_block_region(big_pages_region.outer,
|
||||
max_prefetch_region.outer);
|
||||
|
||||
grow_fault_granularity_if_no_thrashing(prefetch_info, suffix_region, faulted_pages, thrashing_pages);
|
||||
grow_fault_granularity_if_no_thrashing(bitmap_tree,
|
||||
suffix_region,
|
||||
max_prefetch_region.first,
|
||||
faulted_pages,
|
||||
thrashing_pages);
|
||||
}
|
||||
}
|
||||
|
||||
// Within a block we only allow prefetching to a single processor. Therefore, if two processors
|
||||
// are accessing non-overlapping regions within the same block they won't benefit from
|
||||
// prefetching.
|
||||
// Within a block we only allow prefetching to a single processor. Therefore,
|
||||
// if two processors are accessing non-overlapping regions within the same
|
||||
// block they won't benefit from prefetching.
|
||||
//
|
||||
// TODO: Bug 1778034: [uvm] Explore prefetching to different processors within a VA block
|
||||
void uvm_perf_prefetch_prenotify_fault_migrations(uvm_va_block_t *va_block,
|
||||
uvm_va_block_context_t *va_block_context,
|
||||
uvm_processor_id_t new_residency,
|
||||
const uvm_page_mask_t *faulted_pages,
|
||||
uvm_va_block_region_t region)
|
||||
// TODO: Bug 1778034: [uvm] Explore prefetching to different processors within
|
||||
// a VA block.
|
||||
static NvU32 uvm_perf_prefetch_prenotify_fault_migrations(uvm_va_block_t *va_block,
|
||||
uvm_va_block_context_t *va_block_context,
|
||||
uvm_processor_id_t new_residency,
|
||||
const uvm_page_mask_t *faulted_pages,
|
||||
uvm_va_block_region_t faulted_region,
|
||||
uvm_page_mask_t *prefetch_pages,
|
||||
uvm_perf_prefetch_bitmap_tree_t *bitmap_tree)
|
||||
{
|
||||
uvm_page_index_t page_index;
|
||||
block_prefetch_info_t *prefetch_info;
|
||||
const uvm_page_mask_t *resident_mask = NULL;
|
||||
const uvm_page_mask_t *thrashing_pages = NULL;
|
||||
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
|
||||
uvm_va_policy_t *policy = va_block_context->policy;
|
||||
uvm_va_block_region_t max_prefetch_region;
|
||||
NvU32 big_page_size;
|
||||
uvm_va_block_region_t big_pages_region;
|
||||
|
||||
uvm_assert_rwsem_locked(&va_space->lock);
|
||||
|
||||
if (!g_uvm_perf_prefetch_enable)
|
||||
return;
|
||||
|
||||
prefetch_info = prefetch_info_get_create(va_block);
|
||||
if (!prefetch_info)
|
||||
return;
|
||||
|
||||
if (!uvm_id_equal(prefetch_info->last_migration_proc_id, new_residency)) {
|
||||
prefetch_info->last_migration_proc_id = new_residency;
|
||||
prefetch_info->fault_migrations_to_last_proc = 0;
|
||||
if (!uvm_id_equal(va_block->prefetch_info.last_migration_proc_id, new_residency)) {
|
||||
va_block->prefetch_info.last_migration_proc_id = new_residency;
|
||||
va_block->prefetch_info.fault_migrations_to_last_proc = 0;
|
||||
}
|
||||
|
||||
prefetch_info->pending_prefetch_pages = 0;
|
||||
// Compute the expanded region that prefetching is allowed from.
|
||||
if (uvm_va_block_is_hmm(va_block)) {
|
||||
max_prefetch_region = uvm_hmm_get_prefetch_region(va_block,
|
||||
va_block_context,
|
||||
uvm_va_block_region_start(va_block, faulted_region));
|
||||
}
|
||||
else {
|
||||
max_prefetch_region = uvm_va_block_region_from_block(va_block);
|
||||
}
|
||||
|
||||
uvm_page_mask_zero(prefetch_pages);
|
||||
|
||||
if (UVM_ID_IS_CPU(new_residency) || va_block->gpus[uvm_id_gpu_index(new_residency)] != NULL)
|
||||
resident_mask = uvm_va_block_resident_mask_get(va_block, new_residency);
|
||||
|
||||
// If this is a first-touch fault and the destination processor is the
|
||||
// preferred location, populate the whole VA block
|
||||
// preferred location, populate the whole max_prefetch_region.
|
||||
if (uvm_processor_mask_empty(&va_block->resident) &&
|
||||
uvm_id_equal(new_residency, policy->preferred_location)) {
|
||||
uvm_page_mask_region_fill(&prefetch_info->prefetch_pages, uvm_va_block_region_from_block(va_block));
|
||||
uvm_page_mask_region_fill(prefetch_pages, max_prefetch_region);
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (resident_mask)
|
||||
uvm_page_mask_or(&prefetch_info->bitmap_tree.pages, resident_mask, faulted_pages);
|
||||
uvm_page_mask_or(&bitmap_tree->pages, resident_mask, faulted_pages);
|
||||
else
|
||||
uvm_page_mask_copy(&prefetch_info->bitmap_tree.pages, faulted_pages);
|
||||
uvm_page_mask_copy(&bitmap_tree->pages, faulted_pages);
|
||||
|
||||
// Get the big page size for the new residency
|
||||
// If we are using a subregion of the va_block, align bitmap_tree
|
||||
uvm_page_mask_shift_right(&bitmap_tree->pages, &bitmap_tree->pages, max_prefetch_region.first);
|
||||
|
||||
// Get the big page size for the new residency.
|
||||
// Assume 64K size if the new residency is the CPU or no GPU va space is
|
||||
// registered in the current process for this GPU.
|
||||
if (UVM_ID_IS_GPU(new_residency) &&
|
||||
uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, new_residency)) {
|
||||
uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, new_residency);
|
||||
prefetch_info->big_page_size = uvm_va_block_gpu_big_page_size(va_block, gpu);
|
||||
|
||||
big_page_size = uvm_va_block_gpu_big_page_size(va_block, gpu);
|
||||
}
|
||||
else {
|
||||
prefetch_info->big_page_size = UVM_PAGE_SIZE_64K;
|
||||
big_page_size = UVM_PAGE_SIZE_64K;
|
||||
}
|
||||
|
||||
big_pages_region = uvm_va_block_big_page_region_subset(va_block, max_prefetch_region, big_page_size);
|
||||
|
||||
// Adjust the prefetch tree to big page granularity to make sure that we
|
||||
// get big page-friendly prefetching hints
|
||||
prefetch_info->big_pages_region = uvm_va_block_big_page_region_all(va_block, prefetch_info->big_page_size);
|
||||
if (prefetch_info->big_pages_region.first > 0) {
|
||||
prefetch_info->region.first = prefetch_info->big_page_size / PAGE_SIZE - prefetch_info->big_pages_region.first;
|
||||
if (big_pages_region.first - max_prefetch_region.first > 0) {
|
||||
bitmap_tree->offset = big_page_size / PAGE_SIZE - (big_pages_region.first - max_prefetch_region.first);
|
||||
bitmap_tree->leaf_count = uvm_va_block_region_num_pages(max_prefetch_region) + bitmap_tree->offset;
|
||||
|
||||
uvm_page_mask_shift_left(&prefetch_info->bitmap_tree.pages,
|
||||
&prefetch_info->bitmap_tree.pages,
|
||||
prefetch_info->region.first);
|
||||
UVM_ASSERT(bitmap_tree->offset < big_page_size / PAGE_SIZE);
|
||||
UVM_ASSERT(bitmap_tree->leaf_count <= PAGES_PER_UVM_VA_BLOCK);
|
||||
|
||||
uvm_page_mask_shift_left(&bitmap_tree->pages, &bitmap_tree->pages, bitmap_tree->offset);
|
||||
}
|
||||
else {
|
||||
prefetch_info->region.first = 0;
|
||||
bitmap_tree->offset = 0;
|
||||
bitmap_tree->leaf_count = uvm_va_block_region_num_pages(max_prefetch_region);
|
||||
}
|
||||
|
||||
prefetch_info->region.outer = prefetch_info->region.first + uvm_va_block_num_cpu_pages(va_block);
|
||||
bitmap_tree->level_count = ilog2(roundup_pow_of_two(bitmap_tree->leaf_count)) + 1;
|
||||
|
||||
thrashing_pages = uvm_perf_thrashing_get_thrashing_pages(va_block);
|
||||
|
||||
// Assume big pages by default. Prefetch the rest of 4KB subregions within the big page
|
||||
// region unless there is thrashing.
|
||||
grow_fault_granularity(va_block, prefetch_info, faulted_pages, thrashing_pages);
|
||||
// Assume big pages by default. Prefetch the rest of 4KB subregions within
|
||||
// the big page region unless there is thrashing.
|
||||
grow_fault_granularity(bitmap_tree,
|
||||
big_page_size,
|
||||
big_pages_region,
|
||||
max_prefetch_region,
|
||||
faulted_pages,
|
||||
thrashing_pages);
|
||||
|
||||
// Do not compute prefetch regions with faults on pages that are thrashing
|
||||
if (thrashing_pages)
|
||||
uvm_page_mask_andnot(&prefetch_info->migrate_pages, faulted_pages, thrashing_pages);
|
||||
uvm_page_mask_andnot(&va_block_context->scratch_page_mask, faulted_pages, thrashing_pages);
|
||||
else
|
||||
uvm_page_mask_copy(&prefetch_info->migrate_pages, faulted_pages);
|
||||
uvm_page_mask_copy(&va_block_context->scratch_page_mask, faulted_pages);
|
||||
|
||||
// Update the tree using the migration mask to compute the pages to prefetch
|
||||
uvm_page_mask_zero(&prefetch_info->prefetch_pages);
|
||||
for_each_va_block_page_in_region_mask(page_index, &prefetch_info->migrate_pages, region) {
|
||||
uvm_va_block_region_t prefetch_region = compute_prefetch_region(page_index + prefetch_info->region.first,
|
||||
prefetch_info);
|
||||
uvm_page_mask_region_fill(&prefetch_info->prefetch_pages, prefetch_region);
|
||||
// Update the tree using the scratch mask to compute the pages to prefetch
|
||||
for_each_va_block_page_in_region_mask(page_index, &va_block_context->scratch_page_mask, faulted_region) {
|
||||
uvm_va_block_region_t region = compute_prefetch_region(page_index, bitmap_tree, max_prefetch_region);
|
||||
|
||||
uvm_page_mask_region_fill(prefetch_pages, region);
|
||||
|
||||
// Early out if we have already prefetched until the end of the VA block
|
||||
if (prefetch_region.outer == prefetch_info->region.outer)
|
||||
if (region.outer == max_prefetch_region.outer)
|
||||
break;
|
||||
}
|
||||
|
||||
// Adjust prefetching page mask
|
||||
if (prefetch_info->region.first > 0) {
|
||||
uvm_page_mask_shift_right(&prefetch_info->prefetch_pages,
|
||||
&prefetch_info->prefetch_pages,
|
||||
prefetch_info->region.first);
|
||||
}
|
||||
|
||||
done:
|
||||
// Do not prefetch pages that are going to be migrated/populated due to a
|
||||
// fault
|
||||
uvm_page_mask_andnot(&prefetch_info->prefetch_pages,
|
||||
&prefetch_info->prefetch_pages,
|
||||
faulted_pages);
|
||||
uvm_page_mask_andnot(prefetch_pages, prefetch_pages, faulted_pages);
|
||||
|
||||
// TODO: Bug 1765432: prefetching pages that are already mapped on the CPU
|
||||
// would trigger a remap, which may cause a large overhead. Therefore,
|
||||
// exclude them from the mask.
|
||||
if (UVM_ID_IS_CPU(new_residency)) {
|
||||
// For HMM, we don't know what pages are mapped by the CPU unless we try to
|
||||
// migrate them. Prefetch pages will only be opportunistically migrated.
|
||||
if (UVM_ID_IS_CPU(new_residency) && !uvm_va_block_is_hmm(va_block)) {
|
||||
uvm_page_mask_and(&va_block_context->scratch_page_mask,
|
||||
resident_mask,
|
||||
&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
|
||||
uvm_page_mask_andnot(&prefetch_info->prefetch_pages,
|
||||
&prefetch_info->prefetch_pages,
|
||||
&va_block_context->scratch_page_mask);
|
||||
uvm_page_mask_andnot(prefetch_pages, prefetch_pages, &va_block_context->scratch_page_mask);
|
||||
}
|
||||
|
||||
// Avoid prefetching pages that are thrashing
|
||||
if (thrashing_pages) {
|
||||
uvm_page_mask_andnot(&prefetch_info->prefetch_pages,
|
||||
&prefetch_info->prefetch_pages,
|
||||
thrashing_pages);
|
||||
}
|
||||
if (thrashing_pages)
|
||||
uvm_page_mask_andnot(prefetch_pages, prefetch_pages, thrashing_pages);
|
||||
|
||||
prefetch_info->fault_migrations_to_last_proc += uvm_page_mask_region_weight(faulted_pages, region);
|
||||
prefetch_info->pending_prefetch_pages = uvm_page_mask_weight(&prefetch_info->prefetch_pages);
|
||||
va_block->prefetch_info.fault_migrations_to_last_proc += uvm_page_mask_region_weight(faulted_pages, faulted_region);
|
||||
|
||||
return uvm_page_mask_weight(prefetch_pages);
|
||||
}
|
||||
|
||||
uvm_perf_prefetch_hint_t uvm_perf_prefetch_get_hint(uvm_va_block_t *va_block,
|
||||
const uvm_page_mask_t *new_residency_mask)
|
||||
void uvm_perf_prefetch_get_hint(uvm_va_block_t *va_block,
|
||||
uvm_va_block_context_t *va_block_context,
|
||||
uvm_processor_id_t new_residency,
|
||||
const uvm_page_mask_t *faulted_pages,
|
||||
uvm_va_block_region_t faulted_region,
|
||||
uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
|
||||
uvm_perf_prefetch_hint_t *out_hint)
|
||||
{
|
||||
uvm_perf_prefetch_hint_t ret = UVM_PERF_PREFETCH_HINT_NONE();
|
||||
block_prefetch_info_t *prefetch_info;
|
||||
uvm_va_policy_t *policy = va_block_context->policy;
|
||||
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
|
||||
uvm_page_mask_t *prefetch_pages = &out_hint->prefetch_pages_mask;
|
||||
NvU32 pending_prefetch_pages;
|
||||
|
||||
uvm_assert_rwsem_locked(&va_space->lock);
|
||||
uvm_assert_mutex_locked(&va_block->lock);
|
||||
UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, policy, faulted_region));
|
||||
UVM_ASSERT(uvm_hmm_va_block_context_vma_is_valid(va_block, va_block_context, faulted_region));
|
||||
|
||||
out_hint->residency = UVM_ID_INVALID;
|
||||
|
||||
if (!g_uvm_perf_prefetch_enable)
|
||||
return ret;
|
||||
return;
|
||||
|
||||
if (!va_space->test.page_prefetch_enabled)
|
||||
return ret;
|
||||
return;
|
||||
|
||||
prefetch_info = prefetch_info_get(va_block);
|
||||
if (!prefetch_info)
|
||||
return ret;
|
||||
pending_prefetch_pages = uvm_perf_prefetch_prenotify_fault_migrations(va_block,
|
||||
va_block_context,
|
||||
new_residency,
|
||||
faulted_pages,
|
||||
faulted_region,
|
||||
prefetch_pages,
|
||||
bitmap_tree);
|
||||
|
||||
if (prefetch_info->fault_migrations_to_last_proc >= g_uvm_perf_prefetch_min_faults &&
|
||||
prefetch_info->pending_prefetch_pages > 0) {
|
||||
if (va_block->prefetch_info.fault_migrations_to_last_proc >= g_uvm_perf_prefetch_min_faults &&
|
||||
pending_prefetch_pages > 0) {
|
||||
bool changed = false;
|
||||
uvm_range_group_range_t *rgr;
|
||||
|
||||
@@ -402,62 +410,19 @@ uvm_perf_prefetch_hint_t uvm_perf_prefetch_get_hint(uvm_va_block_t *va_block,
|
||||
max(rgr->node.start, va_block->start),
|
||||
min(rgr->node.end, va_block->end));
|
||||
|
||||
if (uvm_page_mask_region_empty(new_residency_mask, region) &&
|
||||
!uvm_page_mask_region_empty(&prefetch_info->prefetch_pages, region)) {
|
||||
uvm_page_mask_region_clear(&prefetch_info->prefetch_pages, region);
|
||||
if (uvm_page_mask_region_empty(faulted_pages, region) &&
|
||||
!uvm_page_mask_region_empty(prefetch_pages, region)) {
|
||||
uvm_page_mask_region_clear(prefetch_pages, region);
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (changed)
|
||||
prefetch_info->pending_prefetch_pages = uvm_page_mask_weight(&prefetch_info->prefetch_pages);
|
||||
pending_prefetch_pages = uvm_page_mask_weight(prefetch_pages);
|
||||
|
||||
if (prefetch_info->pending_prefetch_pages > 0) {
|
||||
ret.residency = prefetch_info->last_migration_proc_id;
|
||||
ret.prefetch_pages_mask = &prefetch_info->prefetch_pages;
|
||||
}
|
||||
if (pending_prefetch_pages > 0)
|
||||
out_hint->residency = va_block->prefetch_info.last_migration_proc_id;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Perf event callback registered for block destroy, block shrink and module
// unload. Extracts the VA block from the event payload and destroys its
// prefetch detection state. Events that carry no block are ignored.
void prefetch_block_destroy_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
{
    uvm_va_block_t *block;

    UVM_ASSERT(g_uvm_perf_prefetch_enable);

    UVM_ASSERT(event_id == UVM_PERF_EVENT_BLOCK_DESTROY ||
               event_id == UVM_PERF_EVENT_MODULE_UNLOAD ||
               event_id == UVM_PERF_EVENT_BLOCK_SHRINK);

    switch (event_id) {
        case UVM_PERF_EVENT_BLOCK_DESTROY:
            block = event_data->block_destroy.block;
            break;

        case UVM_PERF_EVENT_BLOCK_SHRINK:
            block = event_data->block_shrink.block;
            break;

        default:
            // Every other event is read as a module unload payload; the
            // assert above expects UVM_PERF_EVENT_MODULE_UNLOAD here.
            block = event_data->module_unload.block;
            break;
    }

    if (block)
        prefetch_info_destroy(block);
}
|
||||
|
||||
// Load the prefetch perf module into va_space. When prefetching is disabled
// nothing is loaded and NV_OK is returned; otherwise the result of
// uvm_perf_module_load() is passed through.
NV_STATUS uvm_perf_prefetch_load(uvm_va_space_t *va_space)
{
    if (g_uvm_perf_prefetch_enable)
        return uvm_perf_module_load(&g_module_prefetch, va_space);

    return NV_OK;
}
|
||||
|
||||
// Unload the prefetch perf module from va_space. Does nothing when
// prefetching is disabled.
void uvm_perf_prefetch_unload(uvm_va_space_t *va_space)
{
    if (g_uvm_perf_prefetch_enable)
        uvm_perf_module_unload(&g_module_prefetch, va_space);
}
|
||||
|
||||
NV_STATUS uvm_perf_prefetch_init()
|
||||
@@ -467,13 +432,6 @@ NV_STATUS uvm_perf_prefetch_init()
|
||||
if (!g_uvm_perf_prefetch_enable)
|
||||
return NV_OK;
|
||||
|
||||
uvm_perf_module_init("perf_prefetch", UVM_PERF_MODULE_TYPE_PREFETCH, g_callbacks_prefetch,
|
||||
ARRAY_SIZE(g_callbacks_prefetch), &g_module_prefetch);
|
||||
|
||||
g_prefetch_info_cache = NV_KMEM_CACHE_CREATE("block_prefetch_info_t", block_prefetch_info_t);
|
||||
if (!g_prefetch_info_cache)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
if (uvm_perf_prefetch_threshold <= 100) {
|
||||
g_uvm_perf_prefetch_threshold = uvm_perf_prefetch_threshold;
|
||||
}
|
||||
@@ -498,14 +456,6 @@ NV_STATUS uvm_perf_prefetch_init()
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
void uvm_perf_prefetch_exit()
|
||||
{
|
||||
if (!g_uvm_perf_prefetch_enable)
|
||||
return;
|
||||
|
||||
kmem_cache_destroy_safe(&g_prefetch_info_cache);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_test_set_page_prefetch_policy(UVM_TEST_SET_PAGE_PREFETCH_POLICY_PARAMS *params, struct file *filp)
|
||||
{
|
||||
uvm_va_space_t *va_space = uvm_va_space_get(filp);
|
||||
|
||||
Reference in New Issue
Block a user