535.129.03

This commit is contained in:
Bernhard Stoeckner
2023-10-31 14:22:16 +01:00
parent f59818b751
commit e573018659
163 changed files with 86017 additions and 84520 deletions

View File

@@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
EXTRA_CFLAGS += -I$(src)
EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.113.01\"
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.129.03\"
ifneq ($(SYSSRCHOST1X),)
EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
@@ -123,6 +123,9 @@ ifneq ($(wildcard /proc/sgi_uv),)
EXTRA_CFLAGS += -DNV_CONFIG_X86_UV
endif
ifdef VGX_FORCE_VFIO_PCI_CORE
EXTRA_CFLAGS += -DNV_VGPU_FORCE_VFIO_PCI_CORE
endif
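(Illustrative note, not part of the commit: the conditional above presumably takes effect when VGX_FORCE_VFIO_PCI_CORE is set in the build environment or on the make command line, e.g. make ... VGX_FORCE_VFIO_PCI_CORE=1, which adds -DNV_VGPU_FORCE_VFIO_PCI_CORE so the vGPU code paths can prefer the vfio-pci-core framework over MDEV.)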
#
# The conftest.sh script tests various aspects of the target kernel.

View File

@@ -4468,6 +4468,24 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE" "" "types"
;;
mmu_notifier_ops_arch_invalidate_secondary_tlbs)
#
# Determine if the mmu_notifier_ops struct has the
# 'arch_invalidate_secondary_tlbs' member.
#
# struct mmu_notifier_ops.invalidate_range was renamed to
# arch_invalidate_secondary_tlbs by commit 1af5a8109904
# ("mmu_notifiers: rename invalidate_range notifier") due to be
# added in v6.6
CODE="
#include <linux/mmu_notifier.h>
int conftest_mmu_notifier_ops_arch_invalidate_secondary_tlbs(void) {
return offsetof(struct mmu_notifier_ops, arch_invalidate_secondary_tlbs);
}"
compile_check_conftest "$CODE" "NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS" "" "types"
;;
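(Illustrative sketch, not part of the commit: assuming the check above emits NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS into the generated conftest headers, a consumer could select the renamed callback as below; the names my_invalidate and my_mmu_notifier_ops are hypothetical.)
#include <linux/mmu_notifier.h>
static void my_invalidate(struct mmu_notifier *mn, struct mm_struct *mm,
                          unsigned long start, unsigned long end)
{
    /* flush secondary (e.g. GPU) TLBs for [start, end) */
}
static const struct mmu_notifier_ops my_mmu_notifier_ops = {
#if defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
    /* v6.6+: member renamed by commit 1af5a8109904 */
    .arch_invalidate_secondary_tlbs = my_invalidate,
#else
    /* older kernels keep the original member name */
    .invalidate_range = my_invalidate,
#endif
};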
drm_format_num_planes)
#
# Determine if drm_format_num_planes() function is present.
@@ -6681,18 +6699,9 @@ case "$5" in
VFIO_PCI_CORE_PRESENT=1
fi
# When this sanity check is run via nvidia-installer, it sets ARCH as aarch64.
# But, when it is run via Kbuild, ARCH is set as arm64
if [ "$ARCH" = "aarch64" ]; then
ARCH="arm64"
fi
if [ "$VFIO_IOMMU_PRESENT" != "0" ] && [ "$KVM_PRESENT" != "0" ] ; then
# On x86_64, vGPU requires MDEV framework to be present.
# On aarch64, vGPU requires MDEV or vfio-pci-core framework to be present.
if ([ "$ARCH" = "arm64" ] && ([ "$VFIO_MDEV_PRESENT" != "0" ] || [ "$VFIO_PCI_CORE_PRESENT" != "0" ])) ||
([ "$ARCH" = "x86_64" ] && [ "$VFIO_MDEV_PRESENT" != "0" ];) then
# vGPU requires either MDEV or vfio-pci-core framework to be present.
if [ "$VFIO_MDEV_PRESENT" != "0" ] || [ "$VFIO_PCI_CORE_PRESENT" != "0" ]; then
exit 0
fi
fi
@@ -6703,14 +6712,10 @@ case "$5" in
echo "CONFIG_VFIO_IOMMU_TYPE1";
fi
if [ "$ARCH" = "arm64" ] && [ "$VFIO_MDEV_PRESENT" = "0" ] && [ "$VFIO_PCI_CORE_PRESENT" = "0" ]; then
if [ "$VFIO_MDEV_PRESENT" = "0" ] && [ "$VFIO_PCI_CORE_PRESENT" = "0" ]; then
echo "either CONFIG_VFIO_MDEV or CONFIG_VFIO_PCI_CORE";
fi
if [ "$ARCH" = "x86_64" ] && [ "$VFIO_MDEV_PRESENT" = "0" ]; then
echo "CONFIG_VFIO_MDEV";
fi
if [ "$KVM_PRESENT" = "0" ]; then
echo "CONFIG_KVM";
fi

View File

@@ -53,7 +53,13 @@ static int peerdirect_support = NV_MEM_PEERDIRECT_SUPPORT_DEFAULT;
module_param(peerdirect_support, int, S_IRUGO);
MODULE_PARM_DESC(peerdirect_support, "Set level of support for Peer-direct, 0 [default] or 1 [legacy, for example MLNX_OFED 4.9 LTS]");
#define peer_err(FMT, ARGS...) printk(KERN_ERR "nvidia-peermem" " %s:%d " FMT, __FUNCTION__, __LINE__, ## ARGS)
#define peer_err(FMT, ARGS...) printk(KERN_ERR "nvidia-peermem" " %s:%d ERROR " FMT, __FUNCTION__, __LINE__, ## ARGS)
#ifdef NV_MEM_DEBUG
#define peer_trace(FMT, ARGS...) printk(KERN_DEBUG "nvidia-peermem" " %s:%d TRACE " FMT, __FUNCTION__, __LINE__, ## ARGS)
#else
#define peer_trace(FMT, ARGS...) do {} while (0)
#endif
#if defined(NV_MLNX_IB_PEER_MEM_SYMBOLS_PRESENT)
@@ -74,7 +80,10 @@ invalidate_peer_memory mem_invalidate_callback;
static void *reg_handle = NULL;
static void *reg_handle_nc = NULL;
#define NV_MEM_CONTEXT_MAGIC ((u64)0xF1F4F1D0FEF0DAD0ULL)
struct nv_mem_context {
u64 pad1;
struct nvidia_p2p_page_table *page_table;
struct nvidia_p2p_dma_mapping *dma_mapping;
u64 core_context;
@@ -86,8 +95,22 @@ struct nv_mem_context {
struct task_struct *callback_task;
int sg_allocated;
struct sg_table sg_head;
u64 pad2;
};
#define NV_MEM_CONTEXT_CHECK_OK(MC) ({ \
struct nv_mem_context *mc = (MC); \
int rc = ((0 != mc) && \
(READ_ONCE(mc->pad1) == NV_MEM_CONTEXT_MAGIC) && \
(READ_ONCE(mc->pad2) == NV_MEM_CONTEXT_MAGIC)); \
if (!rc) { \
peer_trace("invalid nv_mem_context=%px pad1=%016llx pad2=%016llx\n", \
mc, \
mc?mc->pad1:0, \
mc?mc->pad2:0); \
} \
rc; \
})
static void nv_get_p2p_free_callback(void *data)
{
@@ -97,8 +120,9 @@ static void nv_get_p2p_free_callback(void *data)
struct nvidia_p2p_dma_mapping *dma_mapping = NULL;
__module_get(THIS_MODULE);
if (!nv_mem_context) {
peer_err("nv_get_p2p_free_callback -- invalid nv_mem_context\n");
if (!NV_MEM_CONTEXT_CHECK_OK(nv_mem_context)) {
peer_err("detected invalid context, skipping further processing\n");
goto out;
}
@@ -169,9 +193,11 @@ static int nv_mem_acquire(unsigned long addr, size_t size, void *peer_mem_privat
/* Error case handled as not mine */
return 0;
nv_mem_context->pad1 = NV_MEM_CONTEXT_MAGIC;
nv_mem_context->page_virt_start = addr & GPU_PAGE_MASK;
nv_mem_context->page_virt_end = (addr + size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;
nv_mem_context->mapped_size = nv_mem_context->page_virt_end - nv_mem_context->page_virt_start;
nv_mem_context->pad2 = NV_MEM_CONTEXT_MAGIC;
ret = nvidia_p2p_get_pages(0, 0, nv_mem_context->page_virt_start, nv_mem_context->mapped_size,
&nv_mem_context->page_table, nv_mem_dummy_callback, nv_mem_context);
@@ -195,6 +221,7 @@ static int nv_mem_acquire(unsigned long addr, size_t size, void *peer_mem_privat
return 1;
err:
memset(nv_mem_context, 0, sizeof(*nv_mem_context));
kfree(nv_mem_context);
/* Error case handled as not mine */
@@ -342,6 +369,7 @@ static void nv_mem_release(void *context)
sg_free_table(&nv_mem_context->sg_head);
nv_mem_context->sg_allocated = 0;
}
memset(nv_mem_context, 0, sizeof(*nv_mem_context));
kfree(nv_mem_context);
module_put(THIS_MODULE);
return;
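(Illustrative, standalone sketch of the tripwire pattern introduced above: the two magic guard words plus the memset() before kfree() let a late callback detect a stale or corrupted context instead of dereferencing it. Struct and function names here are hypothetical.)
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define CTX_MAGIC 0xF1F4F1D0FEF0DAD0ULL
struct ctx { uint64_t pad1; int payload; uint64_t pad2; };
/* mirrors NV_MEM_CONTEXT_CHECK_OK: both guard words must still hold the magic */
static int ctx_check_ok(const struct ctx *c)
{
    return c && c->pad1 == CTX_MAGIC && c->pad2 == CTX_MAGIC;
}
int main(void)
{
    struct ctx *c = calloc(1, sizeof(*c));
    c->pad1 = CTX_MAGIC;
    c->payload = 42;
    c->pad2 = CTX_MAGIC;
    printf("live context:    %d\n", ctx_check_ok(c));  /* 1 */
    memset(c, 0, sizeof(*c));   /* what nv_mem_release() now does before kfree() */
    printf("cleared context: %d\n", ctx_check_ok(c));  /* 0 -> skip further processing */
    free(c);
    return 0;
}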

View File

@@ -99,6 +99,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += kmem_cache_has_kobj_remove_work
NV_CONFTEST_TYPE_COMPILE_TESTS += sysfs_slab_unlink
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_fault_t
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_invalidate_range
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_arch_invalidate_secondary_tlbs
NV_CONFTEST_TYPE_COMPILE_TESTS += proc_ops
NV_CONFTEST_TYPE_COMPILE_TESTS += timespec64
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock

View File

@@ -571,7 +571,6 @@ static void uvm_vm_open_managed_entry(struct vm_area_struct *vma)
static void uvm_vm_close_managed(struct vm_area_struct *vma)
{
uvm_va_space_t *va_space = uvm_va_space_get(vma->vm_file);
uvm_processor_id_t gpu_id;
bool make_zombie = false;
if (current->mm != NULL)
@@ -606,12 +605,6 @@ static void uvm_vm_close_managed(struct vm_area_struct *vma)
uvm_destroy_vma_managed(vma, make_zombie);
// Notify GPU address spaces that the fault buffer needs to be flushed to
// avoid finding stale entries that can be attributed to new VA ranges
// reallocated at the same address.
for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
}
uvm_va_space_up_write(va_space);
if (current->mm != NULL)

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2021 NVIDIA Corporation
Copyright (c) 2021-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -94,4 +94,6 @@ void uvm_hal_ada_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->map_remap_larger_page_promotion = false;
parent_gpu->plc_supported = true;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -101,4 +101,6 @@ void uvm_hal_ampere_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->map_remap_larger_page_promotion = false;
parent_gpu->plc_supported = true;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -107,10 +107,10 @@ static NV_STATUS service_ats_faults(uvm_gpu_va_space_t *gpu_va_space,
return status;
}
static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
NvU64 addr,
size_t size,
uvm_fault_client_type_t client_type)
static void flush_tlb_va_region(uvm_gpu_va_space_t *gpu_va_space,
NvU64 addr,
size_t size,
uvm_fault_client_type_t client_type)
{
uvm_ats_fault_invalidate_t *ats_invalidate;
@@ -119,12 +119,12 @@ static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
else
ats_invalidate = &gpu_va_space->gpu->parent->fault_buffer_info.non_replayable.ats_invalidate;
if (!ats_invalidate->write_faults_in_batch) {
uvm_tlb_batch_begin(&gpu_va_space->page_tables, &ats_invalidate->write_faults_tlb_batch);
ats_invalidate->write_faults_in_batch = true;
if (!ats_invalidate->tlb_batch_pending) {
uvm_tlb_batch_begin(&gpu_va_space->page_tables, &ats_invalidate->tlb_batch);
ats_invalidate->tlb_batch_pending = true;
}
uvm_tlb_batch_invalidate(&ats_invalidate->write_faults_tlb_batch, addr, size, PAGE_SIZE, UVM_MEMBAR_NONE);
uvm_tlb_batch_invalidate(&ats_invalidate->tlb_batch, addr, size, PAGE_SIZE, UVM_MEMBAR_NONE);
}
static void ats_batch_select_residency(uvm_gpu_va_space_t *gpu_va_space,
@@ -497,6 +497,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
if (vma->vm_flags & VM_WRITE) {
uvm_page_mask_region_fill(faults_serviced_mask, subregion);
uvm_ats_smmu_invalidate_tlbs(gpu_va_space, start, length);
// The Linux kernel never invalidates TLB entries on mapping
// permission upgrade. This is a problem if the GPU has cached
@@ -507,7 +508,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
// infinite loop because we just forward the fault to the Linux
// kernel and it will see that the permissions in the page table are
// correct. Therefore, we flush TLB entries on ATS write faults.
flush_tlb_write_faults(gpu_va_space, start, length, client_type);
flush_tlb_va_region(gpu_va_space, start, length, client_type);
}
else {
uvm_page_mask_region_fill(reads_serviced_mask, subregion);
@@ -530,6 +531,15 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
return status;
uvm_page_mask_region_fill(faults_serviced_mask, subregion);
// Similarly to permission upgrade scenario, discussed above, GPU
// will not re-fetch the entry if the PTE is invalid and page size
// is 4K. To avoid infinite faulting loop, invalidate TLB for every
// new translation written explicitly like in the case of permission
// upgrade.
if (PAGE_SIZE == UVM_PAGE_SIZE_4K)
flush_tlb_va_region(gpu_va_space, start, length, client_type);
}
return status;
@@ -564,7 +574,7 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
NV_STATUS status;
uvm_push_t push;
if (!ats_invalidate->write_faults_in_batch)
if (!ats_invalidate->tlb_batch_pending)
return NV_OK;
UVM_ASSERT(gpu_va_space);
@@ -576,7 +586,7 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
"Invalidate ATS entries");
if (status == NV_OK) {
uvm_tlb_batch_end(&ats_invalidate->write_faults_tlb_batch, &push, UVM_MEMBAR_NONE);
uvm_tlb_batch_end(&ats_invalidate->tlb_batch, &push, UVM_MEMBAR_NONE);
uvm_push_end(&push);
// Add this push to the GPU's tracker so that fault replays/clears can
@@ -584,8 +594,7 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
status = uvm_tracker_add_push_safe(out_tracker, &push);
}
ats_invalidate->write_faults_in_batch = false;
ats_invalidate->tlb_batch_pending = false;
return status;
}

View File

@@ -52,7 +52,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
bool uvm_ats_check_in_gmmu_region(uvm_va_space_t *va_space, NvU64 address, uvm_va_range_t *next);
// This function performs pending TLB invalidations for ATS and clears the
// ats_invalidate->write_faults_in_batch flag
// ats_invalidate->tlb_batch_pending flag
NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
uvm_ats_fault_invalidate_t *ats_invalidate,
uvm_tracker_t *out_tracker);

View File

@@ -29,8 +29,13 @@
#include "uvm_va_space.h"
#include "uvm_va_space_mm.h"
#include <asm/io.h>
#include <linux/log2.h>
#include <linux/iommu.h>
#include <linux/mm_types.h>
#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/mmu_context.h>
// linux/sched/mm.h is needed for mmget_not_zero and mmput to get the mm
// reference required for the iommu_sva_bind_device() call. This header is not
@@ -46,17 +51,276 @@
#define UVM_IOMMU_SVA_BIND_DEVICE(dev, mm) iommu_sva_bind_device(dev, mm)
#endif
// Type to represent a 128-bit SMMU command queue command.
struct smmu_cmd {
NvU64 low;
NvU64 high;
};
// Base address of SMMU CMDQ-V for GSMMU0.
#define SMMU_CMDQV_BASE_ADDR(smmu_base) (smmu_base + 0x200000)
#define SMMU_CMDQV_BASE_LEN 0x00830000
// CMDQV configuration is done by firmware but we check status here.
#define SMMU_CMDQV_CONFIG 0x0
#define SMMU_CMDQV_CONFIG_CMDQV_EN BIT(0)
// Used to map a particular VCMDQ to a VINTF.
#define SMMU_CMDQV_CMDQ_ALLOC_MAP(vcmdq_id) (0x200 + 0x4 * (vcmdq_id))
#define SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC BIT(0)
// Shift for the field containing the index of the virtual interface
// owning the VCMDQ.
#define SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT 15
// Base address for the VINTF registers.
#define SMMU_VINTF_BASE_ADDR(cmdqv_base_addr, vintf_id) (cmdqv_base_addr + 0x1000 + 0x100 * (vintf_id))
// Virtual interface (VINTF) configuration registers. The WAR only
// works on baremetal so we need to configure ourselves as the
// hypervisor owner.
#define SMMU_VINTF_CONFIG 0x0
#define SMMU_VINTF_CONFIG_ENABLE BIT(0)
#define SMMU_VINTF_CONFIG_HYP_OWN BIT(17)
#define SMMU_VINTF_STATUS 0x0
#define SMMU_VINTF_STATUS_ENABLED BIT(0)
// Calculates the base address for a particular VCMDQ instance.
#define SMMU_VCMDQ_BASE_ADDR(cmdqv_base_addr, vcmdq_id) (cmdqv_base_addr + 0x10000 + 0x80 * (vcmdq_id))
// SMMU command queue consumer index register. Updated by SMMU
// when commands are consumed.
#define SMMU_VCMDQ_CONS 0x0
// SMMU command queue producer index register. Updated by UVM when
// commands are added to the queue.
#define SMMU_VCMDQ_PROD 0x4
// Configuration register used to enable a VCMDQ.
#define SMMU_VCMDQ_CONFIG 0x8
#define SMMU_VCMDQ_CONFIG_ENABLE BIT(0)
// Status register used to check the VCMDQ is enabled.
#define SMMU_VCMDQ_STATUS 0xc
#define SMMU_VCMDQ_STATUS_ENABLED BIT(0)
// Base address offset for the VCMDQ registers.
#define SMMU_VCMDQ_CMDQ_BASE 0x10000
// Size of the command queue. Each command is 16 bytes and we can't
// have a command queue greater than one page in size.
#define SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE (PAGE_SHIFT - ilog2(sizeof(struct smmu_cmd)))
#define SMMU_VCMDQ_CMDQ_ENTRIES (1UL << SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE)
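(Worked example, for orientation only: with 4 KiB pages, PAGE_SHIFT is 12 and ilog2(sizeof(struct smmu_cmd)) is 4, so SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE evaluates to 8 and SMMU_VCMDQ_CMDQ_ENTRIES to 256, i.e. one page holds 256 16-byte commands.)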
// We always use VINTF63 for the WAR
#define VINTF 63
static void smmu_vintf_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
{
iowrite32(val, SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg);
}
static NvU32 smmu_vintf_read32(void __iomem *smmu_cmdqv_base, int reg)
{
return ioread32(SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg);
}
// We always use VCMDQ127 for the WAR
#define VCMDQ 127
void smmu_vcmdq_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
{
iowrite32(val, SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
NvU32 smmu_vcmdq_read32(void __iomem *smmu_cmdqv_base, int reg)
{
return ioread32(SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
static void smmu_vcmdq_write64(void __iomem *smmu_cmdqv_base, int reg, NvU64 val)
{
iowrite64(val, SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
// Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
// TLB invalidates on read-only to read-write upgrades
static NV_STATUS uvm_ats_smmu_war_init(uvm_parent_gpu_t *parent_gpu)
{
uvm_spin_loop_t spin;
NV_STATUS status;
unsigned long cmdqv_config;
void __iomem *smmu_cmdqv_base;
struct acpi_iort_node *node;
struct acpi_iort_smmu_v3 *iort_smmu;
node = *(struct acpi_iort_node **) dev_get_platdata(parent_gpu->pci_dev->dev.iommu->iommu_dev->dev->parent);
iort_smmu = (struct acpi_iort_smmu_v3 *) node->node_data;
smmu_cmdqv_base = ioremap(SMMU_CMDQV_BASE_ADDR(iort_smmu->base_address), SMMU_CMDQV_BASE_LEN);
if (!smmu_cmdqv_base)
return NV_ERR_NO_MEMORY;
parent_gpu->smmu_war.smmu_cmdqv_base = smmu_cmdqv_base;
cmdqv_config = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CONFIG);
if (!(cmdqv_config & SMMU_CMDQV_CONFIG_CMDQV_EN)) {
status = NV_ERR_OBJECT_NOT_FOUND;
goto out;
}
// Allocate SMMU CMDQ pages for WAR
parent_gpu->smmu_war.smmu_cmdq = alloc_page(NV_UVM_GFP_FLAGS | __GFP_ZERO);
if (!parent_gpu->smmu_war.smmu_cmdq) {
status = NV_ERR_NO_MEMORY;
goto out;
}
// Initialise VINTF for the WAR
smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, SMMU_VINTF_CONFIG_ENABLE | SMMU_VINTF_CONFIG_HYP_OWN);
UVM_SPIN_WHILE(!(smmu_vintf_read32(smmu_cmdqv_base, SMMU_VINTF_STATUS) & SMMU_VINTF_STATUS_ENABLED), &spin);
// Allocate VCMDQ to VINTF
iowrite32((VINTF << SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT) | SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC,
smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
smmu_vcmdq_write64(smmu_cmdqv_base, SMMU_VCMDQ_CMDQ_BASE,
page_to_phys(parent_gpu->smmu_war.smmu_cmdq) | SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONS, 0);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_PROD, 0);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, SMMU_VCMDQ_CONFIG_ENABLE);
UVM_SPIN_WHILE(!(smmu_vcmdq_read32(smmu_cmdqv_base, SMMU_VCMDQ_STATUS) & SMMU_VCMDQ_STATUS_ENABLED), &spin);
uvm_mutex_init(&parent_gpu->smmu_war.smmu_lock, UVM_LOCK_ORDER_LEAF);
parent_gpu->smmu_war.smmu_prod = 0;
parent_gpu->smmu_war.smmu_cons = 0;
return NV_OK;
out:
iounmap(parent_gpu->smmu_war.smmu_cmdqv_base);
parent_gpu->smmu_war.smmu_cmdqv_base = NULL;
return status;
}
static void uvm_ats_smmu_war_deinit(uvm_parent_gpu_t *parent_gpu)
{
void __iomem *smmu_cmdqv_base = parent_gpu->smmu_war.smmu_cmdqv_base;
NvU32 cmdq_alloc_map;
if (parent_gpu->smmu_war.smmu_cmdqv_base) {
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, 0);
cmdq_alloc_map = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
iowrite32(cmdq_alloc_map & SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC, smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, 0);
}
if (parent_gpu->smmu_war.smmu_cmdq)
__free_page(parent_gpu->smmu_war.smmu_cmdq);
if (parent_gpu->smmu_war.smmu_cmdqv_base)
iounmap(parent_gpu->smmu_war.smmu_cmdqv_base);
}
// The SMMU on ARM64 can run under different translation regimes depending on
// what features the OS and CPU variant support. The CPU for GH180 supports
// virtualisation extensions and starts the kernel at EL2 meaning SMMU operates
// under the NS-EL2-E2H translation regime. Therefore we need to use the
// TLBI_EL2_* commands which invalidate TLB entries created under this
// translation regime.
#define CMDQ_OP_TLBI_EL2_ASID 0x21;
#define CMDQ_OP_TLBI_EL2_VA 0x22;
#define CMDQ_OP_CMD_SYNC 0x46
// Use the same maximum as used for MAX_TLBI_OPS in the upstream
// kernel.
#define UVM_MAX_TLBI_OPS (1UL << (PAGE_SHIFT - 3))
#if UVM_ATS_SMMU_WAR_REQUIRED()
void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
struct mm_struct *mm = gpu_va_space->va_space->va_space_mm.mm;
uvm_parent_gpu_t *parent_gpu = gpu_va_space->gpu->parent;
struct {
NvU64 low;
NvU64 high;
} *vcmdq;
unsigned long vcmdq_prod;
NvU64 end;
uvm_spin_loop_t spin;
NvU16 asid;
if (!parent_gpu->smmu_war.smmu_cmdqv_base)
return;
asid = arm64_mm_context_get(mm);
vcmdq = kmap(parent_gpu->smmu_war.smmu_cmdq);
uvm_mutex_lock(&parent_gpu->smmu_war.smmu_lock);
vcmdq_prod = parent_gpu->smmu_war.smmu_prod;
// Our queue management is very simple. The mutex prevents multiple
// producers writing to the queue and all our commands require waiting for
// the queue to drain so we know it's empty. If we can't fit enough commands
// in the queue we just invalidate the whole ASID.
//
// The command queue is a circular buffer with the MSB representing a wrap
// bit that must toggle on each wrap. See the SMMU architecture
// specification for more details.
//
// SMMU_VCMDQ_CMDQ_ENTRIES - 1 because we need to leave space for the
// CMD_SYNC.
if ((size >> PAGE_SHIFT) > min(UVM_MAX_TLBI_OPS, SMMU_VCMDQ_CMDQ_ENTRIES - 1)) {
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_TLBI_EL2_ASID;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low |= (NvU64) asid << 48;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = 0;
vcmdq_prod++;
}
else {
for (end = addr + size; addr < end; addr += PAGE_SIZE) {
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_TLBI_EL2_VA;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low |= (NvU64) asid << 48;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = addr & ~((1UL << 12) - 1);
vcmdq_prod++;
}
}
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_CMD_SYNC;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = 0x0;
vcmdq_prod++;
// MSB is the wrap bit
vcmdq_prod &= (1UL << (SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE + 1)) - 1;
parent_gpu->smmu_war.smmu_prod = vcmdq_prod;
smmu_vcmdq_write32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_PROD, parent_gpu->smmu_war.smmu_prod);
UVM_SPIN_WHILE(
(smmu_vcmdq_read32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_CONS) & GENMASK(19, 0)) != vcmdq_prod,
&spin);
uvm_mutex_unlock(&parent_gpu->smmu_war.smmu_lock);
kunmap(parent_gpu->smmu_war.smmu_cmdq);
arm64_mm_context_put(mm);
}
#endif
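(Worked example, for orientation only: with 4 KiB pages, UVM_MAX_TLBI_OPS is 1 << 9 = 512 and the queue holds 256 entries, so ranges larger than 255 pages fall back to a single per-ASID invalidate instead of per-page TLBI commands. The producer index is kept modulo twice the queue size: the low 8 bits select the slot and bit 8 is the wrap flag that must toggle on each wrap, which is why vcmdq_prod is masked with (1UL << (SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE + 1)) - 1 before being written to SMMU_VCMDQ_PROD.)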
NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
{
int ret;
ret = iommu_dev_enable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
if (ret)
return errno_to_nv_status(ret);
return errno_to_nv_status(ret);
if (UVM_ATS_SMMU_WAR_REQUIRED())
return uvm_ats_smmu_war_init(parent_gpu);
else
return NV_OK;
}
void uvm_ats_sva_remove_gpu(uvm_parent_gpu_t *parent_gpu)
{
if (UVM_ATS_SMMU_WAR_REQUIRED())
uvm_ats_smmu_war_deinit(parent_gpu);
iommu_dev_disable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
}

View File

@@ -53,6 +53,17 @@
#define UVM_ATS_SVA_SUPPORTED() 0
#endif
// If NV_ARCH_INVALIDATE_SECONDARY_TLBS is defined it means the upstream fix is
// in place so no need for the WAR from Bug 4130089: [GH180][r535] WAR for
// kernel not issuing SMMU TLB invalidates on read-only to read-write upgrades.
#if defined(NV_ARCH_INVALIDATE_SECONDARY_TLBS)
#define UVM_ATS_SMMU_WAR_REQUIRED() 0
#elif NVCPU_IS_AARCH64
#define UVM_ATS_SMMU_WAR_REQUIRED() 1
#else
#define UVM_ATS_SMMU_WAR_REQUIRED() 0
#endif
typedef struct
{
int placeholder;
@@ -81,6 +92,17 @@ typedef struct
// LOCKING: None
void uvm_ats_sva_unregister_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space);
// Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
// TLB invalidates on read-only to read-write upgrades
#if UVM_ATS_SMMU_WAR_REQUIRED()
void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size);
#else
static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
}
#endif
#else
static NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
{
@@ -111,6 +133,11 @@ typedef struct
{
}
static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
}
#endif // UVM_ATS_SVA_SUPPORTED
#endif // __UVM_ATS_SVA_H__

View File

@@ -21,8 +21,8 @@
*******************************************************************************/
#ifndef _UVM_COMMON_H
#define _UVM_COMMON_H
#ifndef __UVM_COMMON_H__
#define __UVM_COMMON_H__
#ifdef DEBUG
#define UVM_IS_DEBUG() 1
@@ -413,4 +413,40 @@ static inline void uvm_touch_page(struct page *page)
// Return true if the VMA is one used by UVM managed allocations.
bool uvm_vma_is_managed(struct vm_area_struct *vma);
#endif /* _UVM_COMMON_H */
static bool uvm_platform_uses_canonical_form_address(void)
{
if (NVCPU_IS_PPC64LE)
return false;
return true;
}
// Similar to the GPU MMU HAL num_va_bits(), it returns the CPU's num_va_bits().
static NvU32 uvm_cpu_num_va_bits(void)
{
return fls64(TASK_SIZE - 1) + 1;
}
// Return the unaddressable range in a num_va_bits-wide VA space, [first, outer)
static void uvm_get_unaddressable_range(NvU32 num_va_bits, NvU64 *first, NvU64 *outer)
{
UVM_ASSERT(num_va_bits < 64);
UVM_ASSERT(first);
UVM_ASSERT(outer);
if (uvm_platform_uses_canonical_form_address()) {
*first = 1ULL << (num_va_bits - 1);
*outer = (NvU64)((NvS64)(1ULL << 63) >> (64 - num_va_bits));
}
else {
*first = 1ULL << num_va_bits;
*outer = ~0Ull;
}
}
static void uvm_cpu_get_unaddressable_range(NvU64 *first, NvU64 *outer)
{
return uvm_get_unaddressable_range(uvm_cpu_num_va_bits(), first, outer);
}
#endif /* __UVM_COMMON_H__ */
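(Illustrative, standalone check of the arithmetic above, assuming a 48-bit canonical-form VA space such as 4-level x86-64, where fls64(TASK_SIZE - 1) + 1 evaluates to 48; not part of the commit.)
#include <stdint.h>
#include <stdio.h>
int main(void)
{
    unsigned num_va_bits = 48;  /* e.g. 4-level paging x86-64 */
    uint64_t first = 1ULL << (num_va_bits - 1);
    uint64_t outer = (uint64_t)((int64_t)(1ULL << 63) >> (64 - num_va_bits));
    /* prints the non-canonical hole: [0000800000000000, ffff800000000000) */
    printf("[%016llx, %016llx)\n",
           (unsigned long long)first,
           (unsigned long long)outer);
    return 0;
}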

View File

@@ -218,19 +218,12 @@ static bool gpu_supports_uvm(uvm_parent_gpu_t *parent_gpu)
return parent_gpu->rm_info.subdeviceCount == 1;
}
static bool platform_uses_canonical_form_address(void)
{
if (NVCPU_IS_PPC64LE)
return false;
return true;
}
bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
{
// Lower and upper address spaces are typically found in platforms that use
// the canonical address form.
NvU64 max_va_lower;
NvU64 min_va_upper;
NvU64 addr_end = addr + size - 1;
NvU8 gpu_addr_shift;
NvU8 cpu_addr_shift;
@@ -243,7 +236,7 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
UVM_ASSERT(size > 0);
gpu_addr_shift = gpu->address_space_tree.hal->num_va_bits();
cpu_addr_shift = fls64(TASK_SIZE - 1) + 1;
cpu_addr_shift = uvm_cpu_num_va_bits();
addr_shift = gpu_addr_shift;
// Pascal+ GPUs are capable of accessing kernel pointers in various modes
@@ -279,9 +272,7 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
// 0 +----------------+ 0 +----------------+
// On canonical form address platforms and Pascal+ GPUs.
if (platform_uses_canonical_form_address() && gpu_addr_shift > 40) {
NvU64 min_va_upper;
if (uvm_platform_uses_canonical_form_address() && gpu_addr_shift > 40) {
// On x86, when cpu_addr_shift > gpu_addr_shift, it means the CPU uses
// 5-level paging and the GPU is pre-Hopper. On Pascal-Ada GPUs (49b
// wide VA) we set addr_shift to match a 4-level paging x86 (48b wide).
@@ -292,15 +283,11 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
addr_shift = gpu_addr_shift;
else
addr_shift = cpu_addr_shift;
}
min_va_upper = (NvU64)((NvS64)(1ULL << 63) >> (64 - addr_shift));
max_va_lower = 1ULL << (addr_shift - 1);
return (addr_end < max_va_lower) || (addr >= min_va_upper);
}
else {
max_va_lower = 1ULL << addr_shift;
return addr_end < max_va_lower;
}
uvm_get_unaddressable_range(addr_shift, &max_va_lower, &min_va_upper);
return (addr_end < max_va_lower) || (addr >= min_va_upper);
}
// The internal UVM VAS does not use canonical form addresses.
@@ -326,14 +313,14 @@ NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr)
NvU8 addr_shift;
NvU64 input_addr = addr;
if (platform_uses_canonical_form_address()) {
if (uvm_platform_uses_canonical_form_address()) {
// When the CPU VA width is larger than GPU's, it means that:
// On ARM: the CPU is on LVA mode and the GPU is pre-Hopper.
// On x86: the CPU uses 5-level paging and the GPU is pre-Hopper.
// We sign-extend on the 48b on ARM and on the 47b on x86 to mirror the
// behavior of CPUs with smaller (than GPU) VA widths.
gpu_addr_shift = parent_gpu->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K)->num_va_bits();
cpu_addr_shift = fls64(TASK_SIZE - 1) + 1;
cpu_addr_shift = uvm_cpu_num_va_bits();
if (cpu_addr_shift > gpu_addr_shift)
addr_shift = NVCPU_IS_X86_64 ? 48 : 49;

View File

@@ -57,14 +57,16 @@
typedef struct
{
// Number of faults from this uTLB that have been fetched but have not been serviced yet
// Number of faults from this uTLB that have been fetched but have not been
// serviced yet.
NvU32 num_pending_faults;
// Whether the uTLB contains fatal faults
bool has_fatal_faults;
// We have issued a replay of type START_ACK_ALL while containing fatal faults. This puts
// the uTLB in lockdown mode and no new translations are accepted
// We have issued a replay of type START_ACK_ALL while containing fatal
// faults. This puts the uTLB in lockdown mode and no new translations are
// accepted.
bool in_lockdown;
// We have issued a cancel on this uTLB
@@ -126,8 +128,8 @@ struct uvm_service_block_context_struct
struct list_head service_context_list;
// A mask of GPUs that need to be checked for ECC errors before the CPU
// fault handler returns, but after the VA space lock has been unlocked to
// avoid the RM/UVM VA space lock deadlocks.
// fault handler returns, but after the VA space lock has been unlocked
// to avoid the RM/UVM VA space lock deadlocks.
uvm_processor_mask_t gpus_to_check_for_ecc;
// This is set to throttle page fault thrashing.
@@ -160,9 +162,9 @@ struct uvm_service_block_context_struct
struct
{
// Per-processor mask with the pages that will be resident after servicing.
// We need one mask per processor because we may coalesce faults that
// trigger migrations to different processors.
// Per-processor mask with the pages that will be resident after
// servicing. We need one mask per processor because we may coalesce
// faults that trigger migrations to different processors.
uvm_page_mask_t new_residency;
} per_processor_masks[UVM_ID_MAX_PROCESSORS];
@@ -263,7 +265,10 @@ struct uvm_fault_service_batch_context_struct
NvU32 num_coalesced_faults;
bool has_fatal_faults;
// One of the VA spaces in this batch which had fatal faults. If NULL, no
// faults were fatal. More than one VA space could have fatal faults, but we
// pick one to be the target of the cancel sequence.
uvm_va_space_t *fatal_va_space;
bool has_throttled_faults;
@@ -291,11 +296,8 @@ struct uvm_fault_service_batch_context_struct
struct uvm_ats_fault_invalidate_struct
{
// Whether the TLB batch contains any information
bool write_faults_in_batch;
// Batch of TLB entries to be invalidated
uvm_tlb_batch_t write_faults_tlb_batch;
bool tlb_batch_pending;
uvm_tlb_batch_t tlb_batch;
};
typedef struct
@@ -440,20 +442,9 @@ struct uvm_access_counter_service_batch_context_struct
NvU32 num_notifications;
// Boolean used to avoid sorting the fault batch by instance_ptr if we
// determine at fetch time that all the access counter notifications in the
// batch report the same instance_ptr
// determine at fetch time that all the access counter notifications in
// the batch report the same instance_ptr
bool is_single_instance_ptr;
// Scratch space, used to generate artificial physically addressed notifications.
// Virtual address notifications are always aligned to 64k. This means up to 16
// different physical locations could have been accessed to trigger one notification.
// The sub-granularity mask can correspond to any of them.
struct
{
uvm_processor_id_t resident_processors[16];
uvm_gpu_phys_address_t phys_addresses[16];
uvm_access_counter_buffer_entry_t phys_entry;
} scratch;
} virt;
struct
@@ -464,8 +455,8 @@ struct uvm_access_counter_service_batch_context_struct
NvU32 num_notifications;
// Boolean used to avoid sorting the fault batch by aperture if we
// determine at fetch time that all the access counter notifications in the
// batch report the same aperture
// determine at fetch time that all the access counter notifications in
// the batch report the same aperture
bool is_single_aperture;
} phys;
@@ -661,8 +652,8 @@ struct uvm_gpu_struct
struct
{
// Big page size used by the internal UVM VA space
// Notably it may be different than the big page size used by a user's VA
// space in general.
// Notably it may be different than the big page size used by a user's
// VA space in general.
NvU32 internal_size;
} big_page;
@@ -688,8 +679,8 @@ struct uvm_gpu_struct
// lazily-populated array of peer GPUs, indexed by the peer's GPU index
uvm_gpu_t *peer_gpus[UVM_ID_MAX_GPUS];
// Leaf spinlock used to synchronize access to the peer_gpus table so that
// it can be safely accessed from the access counters bottom half
// Leaf spinlock used to synchronize access to the peer_gpus table so
// that it can be safely accessed from the access counters bottom half
uvm_spinlock_t peer_gpus_lock;
} peer_info;
@@ -980,6 +971,10 @@ struct uvm_parent_gpu_struct
bool plc_supported;
// If true, page_tree initialization pre-populates no_ats_ranges. It only
// affects ATS systems.
bool no_ats_range_required;
// Parameters used by the TLB batching API
struct
{
@@ -1051,14 +1046,16 @@ struct uvm_parent_gpu_struct
// Interrupt handling state and locks
uvm_isr_info_t isr;
// Fault buffer info. This is only valid if supports_replayable_faults is set to true
// Fault buffer info. This is only valid if supports_replayable_faults is
// set to true.
uvm_fault_buffer_info_t fault_buffer_info;
// PMM lazy free processing queue.
// TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue.
nv_kthread_q_t lazy_free_q;
// Access counter buffer info. This is only valid if supports_access_counters is set to true
// Access counter buffer info. This is only valid if
// supports_access_counters is set to true.
uvm_access_counter_buffer_info_t access_counter_buffer_info;
// Number of uTLBs per GPC. This information is only valid on Pascal+ GPUs.
@@ -1108,7 +1105,7 @@ struct uvm_parent_gpu_struct
uvm_rb_tree_t instance_ptr_table;
uvm_spinlock_t instance_ptr_table_lock;
// This is set to true if the GPU belongs to an SLI group. Else, set to false.
// This is set to true if the GPU belongs to an SLI group.
bool sli_enabled;
struct
@@ -1135,8 +1132,8 @@ struct uvm_parent_gpu_struct
// environment, rather than using the peer-id field of the PTE (which can
// only address 8 gpus), all gpus are assigned a 47-bit physical address
// space by the fabric manager. Any physical address access to these
// physical address spaces are routed through the switch to the corresponding
// peer.
// physical address spaces are routed through the switch to the
// corresponding peer.
struct
{
bool is_nvswitch_connected;
@@ -1162,6 +1159,16 @@ struct uvm_parent_gpu_struct
NvU64 memory_window_start;
NvU64 memory_window_end;
} system_bus;
// WAR to issue ATS TLB invalidation commands ourselves.
struct
{
uvm_mutex_t smmu_lock;
struct page *smmu_cmdq;
void __iomem *smmu_cmdqv_base;
unsigned long smmu_prod;
unsigned long smmu_cons;
} smmu_war;
};
static const char *uvm_gpu_name(uvm_gpu_t *gpu)
@@ -1351,7 +1358,8 @@ void uvm_gpu_release_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
// They must not be the same gpu.
uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu);
// Get the processor id accessible by the given GPU for the given physical address
// Get the processor id accessible by the given GPU for the given physical
// address.
uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr);
// Get the P2P capabilities between the gpus with the given indexes
@@ -1448,9 +1456,9 @@ NV_STATUS uvm_gpu_check_ecc_error(uvm_gpu_t *gpu);
// Check for ECC errors without calling into RM
//
// Calling into RM is problematic in many places, this check is always safe to do.
// Returns NV_WARN_MORE_PROCESSING_REQUIRED if there might be an ECC error and
// it's required to call uvm_gpu_check_ecc_error() to be sure.
// Calling into RM is problematic in many places, this check is always safe to
// do. Returns NV_WARN_MORE_PROCESSING_REQUIRED if there might be an ECC error
// and it's required to call uvm_gpu_check_ecc_error() to be sure.
NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu);
// Map size bytes of contiguous sysmem on the GPU for physical access
@@ -1507,6 +1515,8 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
// The GPU must be initialized before calling this function.
bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
bool uvm_platform_uses_canonical_form_address(void);
// Returns addr's canonical form for host systems that use canonical form
// addresses.
NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr);
@@ -1553,8 +1563,9 @@ uvm_aperture_t uvm_gpu_page_tree_init_location(const uvm_gpu_t *gpu);
// Debug print of GPU properties
void uvm_gpu_print(uvm_gpu_t *gpu);
// Add the given instance pointer -> user_channel mapping to this GPU. The bottom
// half GPU page fault handler uses this to look up the VA space for GPU faults.
// Add the given instance pointer -> user_channel mapping to this GPU. The
// bottom half GPU page fault handler uses this to look up the VA space for GPU
// faults.
NV_STATUS uvm_gpu_add_user_channel(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel);
void uvm_gpu_remove_user_channel(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel);

View File

@@ -33,17 +33,17 @@
#include "uvm_va_space_mm.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_perf_module.h"
#include "uvm_ats_ibm.h"
#define UVM_PERF_ACCESS_COUNTER_BATCH_COUNT_MIN 1
#define UVM_PERF_ACCESS_COUNTER_BATCH_COUNT_DEFAULT 256
#define UVM_PERF_ACCESS_COUNTER_GRANULARITY_DEFAULT "2m"
#define UVM_PERF_ACCESS_COUNTER_GRANULARITY UVM_ACCESS_COUNTER_GRANULARITY_2M
#define UVM_PERF_ACCESS_COUNTER_THRESHOLD_MIN 1
#define UVM_PERF_ACCESS_COUNTER_THRESHOLD_MAX ((1 << 16) - 1)
#define UVM_PERF_ACCESS_COUNTER_THRESHOLD_DEFAULT 256
#define UVM_ACCESS_COUNTER_ACTION_NOTIFY 0x1
#define UVM_ACCESS_COUNTER_ACTION_CLEAR 0x2
#define UVM_ACCESS_COUNTER_ON_MANAGED 0x4
#define UVM_ACCESS_COUNTER_ACTION_CLEAR 0x1
#define UVM_ACCESS_COUNTER_PHYS_ON_MANAGED 0x2
// Each page in a tracked physical range may belong to a different VA Block. We
// preallocate an array of reverse map translations. However, access counter
@@ -54,12 +54,6 @@
#define UVM_MAX_TRANSLATION_SIZE (2 * 1024 * 1024ULL)
#define UVM_SUB_GRANULARITY_REGIONS 32
// The GPU offers the following tracking granularities: 64K, 2M, 16M, 16G
//
// Use the largest granularity to minimize the number of access counter
// notifications. This is fine because we simply drop the notifications during
// normal operation, and tests override these values.
static UVM_ACCESS_COUNTER_GRANULARITY g_uvm_access_counter_granularity;
static unsigned g_uvm_access_counter_threshold;
// Per-VA space access counters information
@@ -87,7 +81,6 @@ static int uvm_perf_access_counter_momc_migration_enable = -1;
static unsigned uvm_perf_access_counter_batch_count = UVM_PERF_ACCESS_COUNTER_BATCH_COUNT_DEFAULT;
// See module param documentation below
static char *uvm_perf_access_counter_granularity = UVM_PERF_ACCESS_COUNTER_GRANULARITY_DEFAULT;
static unsigned uvm_perf_access_counter_threshold = UVM_PERF_ACCESS_COUNTER_THRESHOLD_DEFAULT;
// Module parameters for the tunables
@@ -100,10 +93,6 @@ MODULE_PARM_DESC(uvm_perf_access_counter_momc_migration_enable,
"Whether MOMC access counters will trigger migrations."
"Valid values: <= -1 (default policy), 0 (off), >= 1 (on)");
module_param(uvm_perf_access_counter_batch_count, uint, S_IRUGO);
module_param(uvm_perf_access_counter_granularity, charp, S_IRUGO);
MODULE_PARM_DESC(uvm_perf_access_counter_granularity,
"Size of the physical memory region tracked by each counter. Valid values as"
"of Volta: 64k, 2m, 16m, 16g");
module_param(uvm_perf_access_counter_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(uvm_perf_access_counter_threshold,
"Number of remote accesses on a region required to trigger a notification."
@@ -136,7 +125,7 @@ static va_space_access_counters_info_t *va_space_access_counters_info_get(uvm_va
// Whether access counter migrations are enabled or not. The policy is as
// follows:
// - MIMC migrations are enabled by default on P9 systems with ATS support
// - MIMC migrations are disabled by default on all systems except P9.
// - MOMC migrations are disabled by default on all systems
// - Users can override this policy by specifying on/off
static bool is_migration_enabled(uvm_access_counter_type_t type)
@@ -159,7 +148,10 @@ static bool is_migration_enabled(uvm_access_counter_type_t type)
if (type == UVM_ACCESS_COUNTER_TYPE_MOMC)
return false;
return g_uvm_global.ats.supported;
if (UVM_ATS_IBM_SUPPORTED())
return g_uvm_global.ats.supported;
return false;
}
// Create the access counters tracking struct for the given VA space
@@ -225,30 +217,18 @@ static NV_STATUS config_granularity_to_bytes(UVM_ACCESS_COUNTER_GRANULARITY gran
return NV_OK;
}
// Clear the given access counter and add it to the per-GPU clear tracker
static NV_STATUS access_counter_clear_targeted(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *entry)
// Clear the access counter notifications and add it to the per-GPU clear
// tracker.
static NV_STATUS access_counter_clear_notifications(uvm_gpu_t *gpu,
uvm_access_counter_buffer_entry_t **notification_start,
NvU32 num_notifications)
{
NvU32 i;
NV_STATUS status;
uvm_push_t push;
uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
if (entry->address.is_virtual) {
status = uvm_push_begin(gpu->channel_manager,
UVM_CHANNEL_TYPE_MEMOPS,
&push,
"Clear access counter with virtual address: 0x%llx",
entry->address.address);
}
else {
status = uvm_push_begin(gpu->channel_manager,
UVM_CHANNEL_TYPE_MEMOPS,
&push,
"Clear access counter with physical address: 0x%llx:%s",
entry->address.address,
uvm_aperture_string(entry->address.aperture));
}
status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, &push, "Clear access counter batch");
if (status != NV_OK) {
UVM_ERR_PRINT("Error creating push to clear access counters: %s, GPU %s\n",
nvstatusToString(status),
@@ -256,7 +236,8 @@ static NV_STATUS access_counter_clear_targeted(uvm_gpu_t *gpu,
return status;
}
gpu->parent->host_hal->access_counter_clear_targeted(&push, entry);
for (i = 0; i < num_notifications; i++)
gpu->parent->host_hal->access_counter_clear_targeted(&push, notification_start[i]);
uvm_push_end(&push);
@@ -381,25 +362,6 @@ NV_STATUS uvm_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu)
g_uvm_access_counter_threshold = uvm_perf_access_counter_threshold;
}
if (strcmp(uvm_perf_access_counter_granularity, "64k") == 0) {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_64K;
}
else if (strcmp(uvm_perf_access_counter_granularity, "2m") == 0) {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_2M;
}
else if (strcmp(uvm_perf_access_counter_granularity, "16m") == 0) {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_16M;
}
else if (strcmp(uvm_perf_access_counter_granularity, "16g") == 0) {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_16G;
}
else {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_2M;
pr_info("Invalid value '%s' for uvm_perf_access_counter_granularity, using '%s' instead",
uvm_perf_access_counter_granularity,
UVM_PERF_ACCESS_COUNTER_GRANULARITY_DEFAULT);
}
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
UVM_ASSERT(parent_gpu->access_counter_buffer_hal != NULL);
@@ -422,7 +384,7 @@ NV_STATUS uvm_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu)
UVM_ASSERT(access_counters->rm_info.bufferSize %
parent_gpu->access_counter_buffer_hal->entry_size(parent_gpu) == 0);
status = config_granularity_to_bytes(g_uvm_access_counter_granularity, &granularity_bytes);
status = config_granularity_to_bytes(UVM_PERF_ACCESS_COUNTER_GRANULARITY, &granularity_bytes);
UVM_ASSERT(status == NV_OK);
if (granularity_bytes > UVM_MAX_TRANSLATION_SIZE)
UVM_ASSERT(granularity_bytes % UVM_MAX_TRANSLATION_SIZE == 0);
@@ -641,8 +603,8 @@ NV_STATUS uvm_gpu_access_counters_enable(uvm_gpu_t *gpu, uvm_va_space_t *va_spac
else {
UvmGpuAccessCntrConfig default_config =
{
.mimcGranularity = g_uvm_access_counter_granularity,
.momcGranularity = g_uvm_access_counter_granularity,
.mimcGranularity = UVM_PERF_ACCESS_COUNTER_GRANULARITY,
.momcGranularity = UVM_PERF_ACCESS_COUNTER_GRANULARITY,
.mimcUseLimit = UVM_ACCESS_COUNTER_USE_LIMIT_FULL,
.momcUseLimit = UVM_ACCESS_COUNTER_USE_LIMIT_FULL,
.threshold = g_uvm_access_counter_threshold,
@@ -767,6 +729,22 @@ static int cmp_sort_virt_notifications_by_instance_ptr(const void *_a, const voi
return cmp_access_counter_instance_ptr(a, b);
}
// Sort comparator for pointers to GVA access counter notification buffer
// entries that sorts by va_space and then by fault address.
static int cmp_sort_virt_notifications_by_va_space_address(const void *_a, const void *_b)
{
const uvm_access_counter_buffer_entry_t **a = (const uvm_access_counter_buffer_entry_t **)_a;
const uvm_access_counter_buffer_entry_t **b = (const uvm_access_counter_buffer_entry_t **)_b;
int result;
result = UVM_CMP_DEFAULT((*a)->virtual_info.va_space, (*b)->virtual_info.va_space);
if (result != 0)
return result;
return UVM_CMP_DEFAULT((*a)->address.address, (*b)->address.address);
}
// Sort comparator for pointers to GPA access counter notification buffer
// entries that sorts by physical address' aperture
static int cmp_sort_phys_notifications_by_processor_id(const void *_a, const void *_b)
@@ -924,12 +902,11 @@ static void translate_virt_notifications_instance_ptrs(uvm_gpu_t *gpu,
// GVA notifications provide an instance_ptr and ve_id that can be directly
// translated to a VA space. In order to minimize translations, we sort the
// entries by instance_ptr.
// entries by instance_ptr, va_space and notification address in that order.
static void preprocess_virt_notifications(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context)
{
if (!batch_context->virt.is_single_instance_ptr) {
// Sort by instance_ptr
sort(batch_context->virt.notifications,
batch_context->virt.num_notifications,
sizeof(*batch_context->virt.notifications),
@@ -938,6 +915,12 @@ static void preprocess_virt_notifications(uvm_gpu_t *gpu,
}
translate_virt_notifications_instance_ptrs(gpu, batch_context);
sort(batch_context->virt.notifications,
batch_context->virt.num_notifications,
sizeof(*batch_context->virt.notifications),
cmp_sort_virt_notifications_by_va_space_address,
NULL);
}
// GPA notifications provide a physical address and an aperture. Sort
@@ -946,7 +929,6 @@ static void preprocess_virt_notifications(uvm_gpu_t *gpu,
static void preprocess_phys_notifications(uvm_access_counter_service_batch_context_t *batch_context)
{
if (!batch_context->phys.is_single_aperture) {
// Sort by instance_ptr
sort(batch_context->phys.notifications,
batch_context->phys.num_notifications,
sizeof(*batch_context->phys.notifications),
@@ -955,6 +937,28 @@ static void preprocess_phys_notifications(uvm_access_counter_service_batch_conte
}
}
static NV_STATUS notify_tools_and_process_flags(uvm_gpu_t *gpu,
uvm_access_counter_buffer_entry_t **notification_start,
NvU32 num_entries,
NvU32 flags)
{
NV_STATUS status = NV_OK;
if (uvm_enable_builtin_tests) {
// TODO: Bug 4310744: [UVM][TOOLS] Attribute access counter tools events
// to va_space instead of broadcasting.
NvU32 i;
for (i = 0; i < num_entries; i++)
uvm_tools_broadcast_access_counter(gpu, notification_start[i], flags & UVM_ACCESS_COUNTER_PHYS_ON_MANAGED);
}
if (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR)
status = access_counter_clear_notifications(gpu, notification_start, num_entries);
return status;
}
static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
@@ -1163,7 +1167,7 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *current_entry,
const uvm_reverse_map_t *reverse_mappings,
size_t num_reverse_mappings,
unsigned *out_flags)
NvU32 *out_flags)
{
size_t index;
uvm_va_block_t *va_block = reverse_mappings[0].va_block;
@@ -1190,7 +1194,6 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
// If an mm is registered with the VA space, we have to retain it
// in order to lock it before locking the VA space.
mm = uvm_va_space_mm_retain_lock(va_space);
uvm_va_space_down_read(va_space);
// Re-check that the VA block is valid after taking the VA block lock.
@@ -1251,7 +1254,7 @@ static NV_STATUS service_phys_va_blocks(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *current_entry,
const uvm_reverse_map_t *reverse_mappings,
size_t num_reverse_mappings,
unsigned *out_flags)
NvU32 *out_flags)
{
NV_STATUS status = NV_OK;
size_t index;
@@ -1259,7 +1262,7 @@ static NV_STATUS service_phys_va_blocks(uvm_gpu_t *gpu,
*out_flags &= ~UVM_ACCESS_COUNTER_ACTION_CLEAR;
for (index = 0; index < num_reverse_mappings; ++index) {
unsigned out_flags_local = 0;
NvU32 out_flags_local = 0;
status = service_phys_single_va_block(gpu,
batch_context,
current_entry,
@@ -1318,7 +1321,7 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
NvU64 address,
unsigned long sub_granularity,
size_t *num_reverse_mappings,
unsigned *out_flags)
NvU32 *out_flags)
{
NV_STATUS status;
NvU32 region_start, region_end;
@@ -1327,7 +1330,10 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
// Get the reverse_map translations for all the regions set in the
// sub_granularity field of the counter.
for_each_sub_granularity_region(region_start, region_end, sub_granularity, config->sub_granularity_regions_per_translation) {
for_each_sub_granularity_region(region_start,
region_end,
sub_granularity,
config->sub_granularity_regions_per_translation) {
NvU64 local_address = address + region_start * config->sub_granularity_region_size;
NvU32 local_translation_size = (region_end - region_start) * config->sub_granularity_region_size;
uvm_reverse_map_t *local_reverse_mappings = batch_context->phys.translations + *num_reverse_mappings;
@@ -1376,7 +1382,7 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context,
const uvm_access_counter_buffer_entry_t *current_entry,
unsigned *out_flags)
NvU32 *out_flags)
{
NvU64 address;
NvU64 translation_index;
@@ -1387,7 +1393,7 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
size_t total_reverse_mappings = 0;
uvm_gpu_t *resident_gpu = NULL;
NV_STATUS status = NV_OK;
unsigned flags = 0;
NvU32 flags = 0;
address = current_entry->address.address;
UVM_ASSERT(address % config->translation_size == 0);
@@ -1415,7 +1421,7 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
for (translation_index = 0; translation_index < config->translations_per_counter; ++translation_index) {
size_t num_reverse_mappings;
unsigned out_flags_local = 0;
NvU32 out_flags_local = 0;
status = service_phys_notification_translation(gpu,
resident_gpu,
batch_context,
@@ -1437,11 +1443,8 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
sub_granularity = sub_granularity >> config->sub_granularity_regions_per_translation;
}
// Currently we only report events for our tests, not for tools
if (uvm_enable_builtin_tests) {
*out_flags |= UVM_ACCESS_COUNTER_ACTION_NOTIFY;
*out_flags |= ((total_reverse_mappings != 0) ? UVM_ACCESS_COUNTER_ON_MANAGED : 0);
}
if (uvm_enable_builtin_tests)
*out_flags |= ((total_reverse_mappings != 0) ? UVM_ACCESS_COUNTER_PHYS_ON_MANAGED : 0);
if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
*out_flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
@@ -1454,22 +1457,21 @@ static NV_STATUS service_phys_notifications(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context)
{
NvU32 i;
uvm_access_counter_buffer_entry_t **notifications = batch_context->phys.notifications;
preprocess_phys_notifications(batch_context);
for (i = 0; i < batch_context->phys.num_notifications; ++i) {
NV_STATUS status;
uvm_access_counter_buffer_entry_t *current_entry = batch_context->phys.notifications[i];
unsigned flags = 0;
uvm_access_counter_buffer_entry_t *current_entry = notifications[i];
NvU32 flags = 0;
if (!UVM_ID_IS_VALID(current_entry->physical_info.resident_id))
continue;
status = service_phys_notification(gpu, batch_context, current_entry, &flags);
if (flags & UVM_ACCESS_COUNTER_ACTION_NOTIFY)
uvm_tools_broadcast_access_counter(gpu, current_entry, flags & UVM_ACCESS_COUNTER_ON_MANAGED);
if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
status = access_counter_clear_targeted(gpu, current_entry);
notify_tools_and_process_flags(gpu, &notifications[i], 1, flags);
if (status != NV_OK)
return status;
@@ -1478,152 +1480,218 @@ static NV_STATUS service_phys_notifications(uvm_gpu_t *gpu,
return NV_OK;
}
static int cmp_sort_gpu_phys_addr(const void *_a, const void *_b)
static NV_STATUS service_notification_va_block_helper(struct mm_struct *mm,
uvm_va_block_t *va_block,
uvm_processor_id_t processor,
uvm_access_counter_service_batch_context_t *batch_context)
{
return uvm_gpu_phys_addr_cmp(*(uvm_gpu_phys_address_t*)_a,
*(uvm_gpu_phys_address_t*)_b);
}
uvm_va_block_retry_t va_block_retry;
uvm_page_mask_t *accessed_pages = &batch_context->accessed_pages;
uvm_service_block_context_t *service_context = &batch_context->block_service_context;
static bool gpu_phys_same_region(uvm_gpu_phys_address_t a, uvm_gpu_phys_address_t b, NvU64 granularity)
{
if (a.aperture != b.aperture)
return false;
UVM_ASSERT(is_power_of_2(granularity));
return UVM_ALIGN_DOWN(a.address, granularity) == UVM_ALIGN_DOWN(b.address, granularity);
}
static bool phys_address_in_accessed_sub_region(uvm_gpu_phys_address_t address,
NvU64 region_size,
NvU64 sub_region_size,
NvU32 accessed_mask)
{
const unsigned accessed_index = (address.address % region_size) / sub_region_size;
// accessed_mask is only filled for tracking granularities larger than 64K
if (region_size == UVM_PAGE_SIZE_64K)
return true;
UVM_ASSERT(accessed_index < 32);
return ((1 << accessed_index) & accessed_mask) != 0;
}
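(Worked example, for orientation only: with the default 2 MiB tracking granularity and 64 KiB sub-regions there are 32 sub-regions per counter, so a physical address whose offset within its 2 MiB region is 0x90000 maps to index 0x90000 / 0x10000 = 9, and bit 9 of the 32-bit accessed/sub_granularity mask says whether that sub-region was actually touched.)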
static NV_STATUS service_virt_notification(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context,
const uvm_access_counter_buffer_entry_t *current_entry,
unsigned *out_flags)
{
NV_STATUS status = NV_OK;
NvU64 notification_size;
NvU64 address;
uvm_processor_id_t *resident_processors = batch_context->virt.scratch.resident_processors;
uvm_gpu_phys_address_t *phys_addresses = batch_context->virt.scratch.phys_addresses;
int num_addresses = 0;
int i;
// Virtual address notifications are always 64K aligned
NvU64 region_start = current_entry->address.address;
NvU64 region_end = current_entry->address.address + UVM_PAGE_SIZE_64K;
uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
uvm_access_counter_type_t counter_type = current_entry->counter_type;
const uvm_gpu_access_counter_type_config_t *config = get_config_for_type(access_counters, counter_type);
uvm_va_space_t *va_space = current_entry->virtual_info.va_space;
UVM_ASSERT(counter_type == UVM_ACCESS_COUNTER_TYPE_MIMC);
// Entries with NULL va_space are simply dropped.
if (!va_space)
if (uvm_page_mask_empty(accessed_pages))
return NV_OK;
status = config_granularity_to_bytes(config->rm.granularity, &notification_size);
if (status != NV_OK)
return status;
uvm_assert_mutex_locked(&va_block->lock);
// Collect physical locations that could have been touched
// in the reported 64K VA region. The notification mask can
// correspond to any of them.
uvm_va_space_down_read(va_space);
for (address = region_start; address < region_end;) {
uvm_va_block_t *va_block;
service_context->operation = UVM_SERVICE_OPERATION_ACCESS_COUNTERS;
service_context->num_retries = 0;
service_context->block_context.mm = mm;
NV_STATUS local_status = uvm_va_block_find(va_space, address, &va_block);
if (local_status == NV_ERR_INVALID_ADDRESS || local_status == NV_ERR_OBJECT_NOT_FOUND) {
address += PAGE_SIZE;
continue;
}
return UVM_VA_BLOCK_RETRY_LOCKED(va_block,
&va_block_retry,
service_va_block_locked(processor,
va_block,
&va_block_retry,
service_context,
accessed_pages));
}
uvm_mutex_lock(&va_block->lock);
while (address < va_block->end && address < region_end) {
const unsigned page_index = uvm_va_block_cpu_page_index(va_block, address);
static void expand_notification_block(struct mm_struct *mm,
uvm_gpu_va_space_t *gpu_va_space,
uvm_va_block_t *va_block,
uvm_page_mask_t *accessed_pages,
const uvm_access_counter_buffer_entry_t *current_entry)
{
NvU64 addr;
NvU64 granularity = 0;
uvm_gpu_t *resident_gpu = NULL;
uvm_processor_id_t resident_id;
uvm_page_index_t page_index;
uvm_gpu_t *gpu = gpu_va_space->gpu;
const uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
const uvm_gpu_access_counter_type_config_t *config = get_config_for_type(access_counters,
UVM_ACCESS_COUNTER_TYPE_MIMC);
// UVM va_block always maps the closest resident location to processor
const uvm_processor_id_t res_id = uvm_va_block_page_get_closest_resident(va_block, page_index, gpu->id);
config_granularity_to_bytes(config->rm.granularity, &granularity);
// Add physical location if it's valid and not local vidmem
if (UVM_ID_IS_VALID(res_id) && !uvm_id_equal(res_id, gpu->id)) {
uvm_gpu_phys_address_t phys_address = uvm_va_block_res_phys_page_address(va_block, page_index, res_id, gpu);
if (phys_address_in_accessed_sub_region(phys_address,
notification_size,
config->sub_granularity_region_size,
current_entry->sub_granularity)) {
resident_processors[num_addresses] = res_id;
phys_addresses[num_addresses] = phys_address;
++num_addresses;
}
else {
UVM_DBG_PRINT_RL("Skipping phys address %llx:%s, because it couldn't have been accessed in mask %x",
phys_address.address,
uvm_aperture_string(phys_address.aperture),
current_entry->sub_granularity);
}
}
// Granularities other than 2MB can only be enabled by UVM tests. Do nothing
// in that case.
if (granularity != UVM_PAGE_SIZE_2M)
return;
address += PAGE_SIZE;
}
uvm_mutex_unlock(&va_block->lock);
addr = current_entry->address.address;
uvm_assert_rwsem_locked(&gpu_va_space->va_space->lock);
uvm_assert_mutex_locked(&va_block->lock);
page_index = uvm_va_block_cpu_page_index(va_block, addr);
resident_id = uvm_va_block_page_get_closest_resident(va_block, page_index, gpu->id);
// resident_id might be invalid or might already be the same as the GPU
// which received the notification if the memory was already migrated before
// acquiring the locks either during the servicing of previous notifications
// or during faults or because of explicit migrations or if the VA range was
// freed after receiving the notification. Return NV_OK in such cases.
if (!UVM_ID_IS_VALID(resident_id) || uvm_id_equal(resident_id, gpu->id))
return;
if (UVM_ID_IS_GPU(resident_id))
resident_gpu = uvm_va_space_get_gpu(gpu_va_space->va_space, resident_id);
if (uvm_va_block_get_physical_size(va_block, resident_id, page_index) != granularity) {
uvm_page_mask_set(accessed_pages, page_index);
}
uvm_va_space_up_read(va_space);
else {
NvU32 region_start;
NvU32 region_end;
unsigned long sub_granularity = current_entry->sub_granularity;
NvU32 num_regions = config->sub_granularity_regions_per_translation;
NvU32 num_sub_pages = config->sub_granularity_region_size / PAGE_SIZE;
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);
// The addresses need to be sorted to aid coalescing.
sort(phys_addresses,
num_addresses,
sizeof(*phys_addresses),
cmp_sort_gpu_phys_addr,
NULL);
UVM_ASSERT(num_sub_pages >= 1);
for (i = 0; i < num_addresses; ++i) {
uvm_access_counter_buffer_entry_t *fake_entry = &batch_context->virt.scratch.phys_entry;
// Skip the current pointer if the physical region was already handled
if (i > 0 && gpu_phys_same_region(phys_addresses[i - 1], phys_addresses[i], notification_size)) {
UVM_ASSERT(uvm_id_equal(resident_processors[i - 1], resident_processors[i]));
continue;
// region_start and region_end refer to sub_granularity indices, not
// page_indices.
for_each_sub_granularity_region(region_start, region_end, sub_granularity, num_regions) {
uvm_page_mask_region_fill(accessed_pages,
uvm_va_block_region(region_start * num_sub_pages,
region_end * num_sub_pages));
}
UVM_DBG_PRINT_RL("Faking MIMC address[%i/%i]: %llx (granularity mask: %llx) in aperture %s on device %s\n",
i,
num_addresses,
phys_addresses[i].address,
notification_size - 1,
uvm_aperture_string(phys_addresses[i].aperture),
uvm_gpu_name(gpu));
// Construct a fake phys addr AC entry
fake_entry->counter_type = current_entry->counter_type;
fake_entry->address.address = UVM_ALIGN_DOWN(phys_addresses[i].address, notification_size);
fake_entry->address.aperture = phys_addresses[i].aperture;
fake_entry->address.is_virtual = false;
fake_entry->physical_info.resident_id = resident_processors[i];
fake_entry->counter_value = current_entry->counter_value;
fake_entry->sub_granularity = current_entry->sub_granularity;
// Remove pages in the va_block which are not resident on resident_id.
// If the GPU is heavily accessing those pages, future access counter
// migrations will migrate them to the GPU.
uvm_page_mask_and(accessed_pages, accessed_pages, resident_mask);
}
}
status = service_phys_notification(gpu, batch_context, fake_entry, out_flags);
if (status != NV_OK)
static NV_STATUS service_virt_notifications_in_block(struct mm_struct *mm,
uvm_gpu_va_space_t *gpu_va_space,
uvm_va_block_t *va_block,
uvm_access_counter_service_batch_context_t *batch_context,
NvU32 index,
NvU32 *out_index)
{
NvU32 i = index;
NvU32 flags = 0;
NV_STATUS status = NV_OK;
NV_STATUS flags_status;
uvm_gpu_t *gpu = gpu_va_space->gpu;
uvm_va_space_t *va_space = gpu_va_space->va_space;
uvm_page_mask_t *accessed_pages = &batch_context->accessed_pages;
uvm_access_counter_buffer_entry_t **notifications = batch_context->virt.notifications;
UVM_ASSERT(va_block);
UVM_ASSERT(i < batch_context->virt.num_notifications);
uvm_assert_rwsem_locked(&va_space->lock);
uvm_page_mask_zero(accessed_pages);
uvm_mutex_lock(&va_block->lock);
while (i < batch_context->virt.num_notifications) {
uvm_access_counter_buffer_entry_t *current_entry = notifications[i];
NvU64 address = current_entry->address.address;
if ((current_entry->virtual_info.va_space != va_space) || (address > va_block->end)) {
*out_index = i;
break;
}
expand_notification_block(mm, gpu_va_space, va_block, accessed_pages, current_entry);
i++;
*out_index = i;
}
status = service_notification_va_block_helper(mm, va_block, gpu->id, batch_context);
uvm_mutex_unlock(&va_block->lock);
// At least one notification should have been processed.
UVM_ASSERT(index < *out_index);
if (status == NV_OK)
flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
flags_status = notify_tools_and_process_flags(gpu, &notifications[index], *out_index - index, flags);
if ((status == NV_OK) && (flags_status != NV_OK))
status = flags_status;
return status;
}
static NV_STATUS service_virt_notifications_batch(struct mm_struct *mm,
uvm_gpu_va_space_t *gpu_va_space,
uvm_access_counter_service_batch_context_t *batch_context,
NvU32 index,
NvU32 *out_index)
{
NV_STATUS status;
uvm_va_block_t *va_block;
uvm_va_space_t *va_space = gpu_va_space->va_space;
uvm_access_counter_buffer_entry_t *current_entry = batch_context->virt.notifications[index];
NvU64 address = current_entry->address.address;
UVM_ASSERT(va_space);
uvm_assert_rwsem_locked(&va_space->lock);
// Virtual address notifications are always 64K aligned
UVM_ASSERT(IS_ALIGNED(address, UVM_PAGE_SIZE_64K));
// TODO: Bug 4309292: [UVM][HMM] Re-enable access counter HMM block
// migrations for virtual notifications on configs with
// 4KB page size
status = uvm_va_block_find(va_space, address, &va_block);
if ((status == NV_OK) && !uvm_va_block_is_hmm(va_block)) {
UVM_ASSERT(va_block);
status = service_virt_notifications_in_block(mm, gpu_va_space, va_block, batch_context, index, out_index);
}
else {
NvU32 flags = 0;
UVM_ASSERT((status == NV_ERR_OBJECT_NOT_FOUND) ||
(status == NV_ERR_INVALID_ADDRESS) ||
uvm_va_block_is_hmm(va_block));
// NV_ERR_OBJECT_NOT_FOUND is returned if the VA range is valid but no
// VA block has been allocated yet. This can happen if there are stale
// notifications in the batch. A new VA range may have been allocated in
// that range. So, clear the notification entry to continue getting
// notifications for the new VA range.
if (status == NV_ERR_OBJECT_NOT_FOUND)
flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
// NV_ERR_INVALID_ADDRESS is returned if the corresponding VA range
// doesn't exist or it's not a managed range. Access counter migrations
// are not currently supported on such ranges.
//
// TODO: Bug 1990466: [uvm] Use access counters to trigger migrations
// When support for SAM migrations is added, clear the notification
// entry if the VA range doesn't exist in order to receive notifications
// when a new VA range is allocated in that region.
status = notify_tools_and_process_flags(gpu_va_space->gpu, &batch_context->virt.notifications[index], 1, flags);
*out_index = index + 1;
status = NV_OK;
}
return status;
@@ -1632,33 +1700,67 @@ static NV_STATUS service_virt_notification(uvm_gpu_t *gpu,
static NV_STATUS service_virt_notifications(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context)
{
NvU32 i;
NvU32 i = 0;
NV_STATUS status = NV_OK;
struct mm_struct *mm = NULL;
uvm_va_space_t *va_space = NULL;
uvm_va_space_t *prev_va_space = NULL;
uvm_gpu_va_space_t *gpu_va_space = NULL;
// TODO: Bug 4299018: Add support for virtual access counter migrations on
// 4K page sizes.
if (PAGE_SIZE == UVM_PAGE_SIZE_4K) {
return notify_tools_and_process_flags(gpu,
batch_context->virt.notifications,
batch_context->virt.num_notifications,
0);
}
preprocess_virt_notifications(gpu, batch_context);
for (i = 0; i < batch_context->virt.num_notifications; ++i) {
unsigned flags = 0;
while (i < batch_context->virt.num_notifications) {
uvm_access_counter_buffer_entry_t *current_entry = batch_context->virt.notifications[i];
va_space = current_entry->virtual_info.va_space;
status = service_virt_notification(gpu, batch_context, current_entry, &flags);
if (va_space != prev_va_space) {
UVM_DBG_PRINT_RL("Processed virt access counter (%d/%d): %sMANAGED (status: %d) clear: %s\n",
i + 1,
batch_context->virt.num_notifications,
(flags & UVM_ACCESS_COUNTER_ON_MANAGED) ? "" : "NOT ",
status,
(flags & UVM_ACCESS_COUNTER_ACTION_CLEAR) ? "YES" : "NO");
// New va_space detected, drop locks of the old va_space.
if (prev_va_space) {
uvm_va_space_up_read(prev_va_space);
uvm_va_space_mm_release_unlock(prev_va_space, mm);
if (uvm_enable_builtin_tests)
uvm_tools_broadcast_access_counter(gpu, current_entry, flags & UVM_ACCESS_COUNTER_ON_MANAGED);
mm = NULL;
gpu_va_space = NULL;
}
if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
status = access_counter_clear_targeted(gpu, current_entry);
// Acquire locks for the new va_space.
if (va_space) {
mm = uvm_va_space_mm_retain_lock(va_space);
uvm_va_space_down_read(va_space);
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
}
prev_va_space = va_space;
}
if (va_space && gpu_va_space && uvm_va_space_has_access_counter_migrations(va_space)) {
status = service_virt_notifications_batch(mm, gpu_va_space, batch_context, i, &i);
}
else {
status = notify_tools_and_process_flags(gpu, &batch_context->virt.notifications[i], 1, 0);
i++;
}
if (status != NV_OK)
break;
}
if (va_space) {
uvm_va_space_up_read(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
}
return status;
}
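// --- Illustrative sketch, not part of this change ---
// The loop above walks notifications sorted by va_space and hands the
// va_space locks over whenever the key changes. Below is a minimal,
// self-contained userspace sketch of that "group-by-key with lock handoff"
// pattern, using pthread rwlocks and hypothetical example_* types purely for
// illustration (build with -pthread).
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

typedef struct {
    pthread_rwlock_t lock;
} example_space_t;

typedef struct {
    example_space_t *space;    // grouping key (may repeat on consecutive items)
    int payload;
} example_item_t;

static void example_process_sorted(example_item_t *items, size_t count)
{
    example_space_t *prev = NULL;
    size_t i;

    for (i = 0; i < count; i++) {
        example_space_t *cur = items[i].space;

        if (cur != prev) {
            // Key changed: drop the previous group's lock...
            if (prev)
                pthread_rwlock_unlock(&prev->lock);

            // ...and take the new group's lock before touching its items.
            if (cur)
                pthread_rwlock_rdlock(&cur->lock);

            prev = cur;
        }

        // Service items[i] here while holding cur's lock (omitted).
        printf("item %zu in space %p\n", i, (void *)cur);
    }

    if (prev)
        pthread_rwlock_unlock(&prev->lock);
}

int main(void)
{
    example_space_t a = { PTHREAD_RWLOCK_INITIALIZER };
    example_space_t b = { PTHREAD_RWLOCK_INITIALIZER };
    example_item_t items[] = { { &a, 1 }, { &a, 2 }, { &b, 3 } };

    example_process_sorted(items, sizeof(items) / sizeof(items[0]));
    return 0;
}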
@@ -1941,6 +2043,7 @@ NV_STATUS uvm_test_reset_access_counters(UVM_TEST_RESET_ACCESS_COUNTERS_PARAMS *
}
else {
uvm_access_counter_buffer_entry_t entry = { 0 };
uvm_access_counter_buffer_entry_t *notification = &entry;
if (params->counter_type == UVM_TEST_ACCESS_COUNTER_TYPE_MIMC)
entry.counter_type = UVM_ACCESS_COUNTER_TYPE_MIMC;
@@ -1950,7 +2053,7 @@ NV_STATUS uvm_test_reset_access_counters(UVM_TEST_RESET_ACCESS_COUNTERS_PARAMS *
entry.bank = params->bank;
entry.tag = params->tag;
status = access_counter_clear_targeted(gpu, &entry);
status = access_counter_clear_notifications(gpu, &notification, 1);
}
if (status == NV_OK)

View File

@@ -235,17 +235,27 @@ static NV_STATUS fetch_non_replayable_fault_buffer_entries(uvm_parent_gpu_t *par
return NV_OK;
}
// In SRIOV, the UVM (guest) driver does not have access to the privileged
// registers used to clear the faulted bit. Instead, UVM requests host RM to do
// the clearing on its behalf, using a SW method.
static bool use_clear_faulted_channel_sw_method(uvm_gpu_t *gpu)
{
if (uvm_gpu_is_virt_mode_sriov(gpu)) {
UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);
return true;
}
// If true, UVM uses a SW method to request RM to do the clearing on its
// behalf.
bool use_sw_method = false;
return false;
// In SRIOV, the UVM (guest) driver does not have access to the privileged
// registers used to clear the faulted bit.
if (uvm_gpu_is_virt_mode_sriov(gpu))
use_sw_method = true;
// In Confidential Computing access to the privileged registers is blocked,
// in order to prevent interference between guests, or between the
// (untrusted) host and the guests.
if (g_uvm_global.conf_computing_enabled)
use_sw_method = true;
if (use_sw_method)
UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);
return use_sw_method;
}
static NV_STATUS clear_faulted_method_on_gpu(uvm_gpu_t *gpu,
@@ -570,7 +580,7 @@ static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
ats_context->client_type = UVM_FAULT_CLIENT_TYPE_HUB;
ats_invalidate->write_faults_in_batch = false;
ats_invalidate->tlb_batch_pending = false;
va_range_next = uvm_va_space_iter_first(gpu_va_space->va_space, fault_entry->fault_address, ~0ULL);

View File

@@ -1180,7 +1180,11 @@ static void mark_fault_fatal(uvm_fault_service_batch_context_t *batch_context,
fault_entry->replayable.cancel_va_mode = cancel_va_mode;
utlb->has_fatal_faults = true;
batch_context->has_fatal_faults = true;
if (!batch_context->fatal_va_space) {
UVM_ASSERT(fault_entry->va_space);
batch_context->fatal_va_space = fault_entry->va_space;
}
}
static void fault_entry_duplicate_flags(uvm_fault_service_batch_context_t *batch_context,
@@ -1511,7 +1515,7 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
++block_context->num_retries;
if (status == NV_OK && batch_context->has_fatal_faults)
if (status == NV_OK && batch_context->fatal_va_space)
status = uvm_va_block_set_cancel(va_block, &block_context->block_context, gpu);
return status;
@@ -1937,14 +1941,198 @@ static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
return status;
}
// Called when a fault in the batch has been marked fatal. Flush the buffer
// under the VA and mmap locks to remove any potential stale fatal faults, then
// service all new faults for just that VA space and cancel those which are
// fatal. Faults in other VA spaces are replayed when done and will be processed
// when normal fault servicing resumes.
static NV_STATUS service_fault_batch_for_cancel(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
NV_STATUS status = NV_OK;
NvU32 i;
uvm_va_space_t *va_space = batch_context->fatal_va_space;
uvm_gpu_va_space_t *gpu_va_space = NULL;
struct mm_struct *mm;
uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
uvm_service_block_context_t *service_context = &gpu->parent->fault_buffer_info.replayable.block_service_context;
uvm_va_block_context_t *va_block_context = &service_context->block_context;
UVM_ASSERT(gpu->parent->replayable_faults_supported);
UVM_ASSERT(va_space);
// Perform the flush and re-fetch while holding the mmap_lock and the
// VA space lock. This avoids stale faults because it prevents any vma
// modifications (mmap, munmap, mprotect) from happening between the time HW
// takes the fault and we cancel it.
mm = uvm_va_space_mm_retain_lock(va_space);
va_block_context->mm = mm;
uvm_va_space_down_read(va_space);
// We saw fatal faults in this VA space before. Flush while holding
// mmap_lock to make sure those faults come back (aren't stale).
//
// We need to wait until all old fault messages have arrived before
// flushing, hence UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT.
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status != NV_OK)
goto done;
// Wait for the flush's replay to finish to give the legitimate faults a
// chance to show up in the buffer again.
status = uvm_tracker_wait(&replayable_faults->replay_tracker);
if (status != NV_OK)
goto done;
// We expect all replayed faults to have arrived in the buffer so we can re-
// service them. The replay-and-wait sequence above will ensure they're all
// in the HW buffer. When GSP owns the HW buffer, we also have to wait for
// GSP to copy all available faults from the HW buffer into the shadow
// buffer.
//
// TODO: Bug 2533557: This flush does not actually guarantee that GSP will
// copy over all faults.
status = hw_fault_buffer_flush_locked(gpu->parent);
if (status != NV_OK)
goto done;
// If there is no GPU VA space for the GPU, ignore all faults in the VA
// space. This can happen if the GPU VA space has been destroyed since we
// unlocked the VA space in service_fault_batch. That means the fatal faults
// are stale, because unregistering the GPU VA space requires preempting the
// context and detaching all channels in that VA space. Restart fault
// servicing from the top.
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (!gpu_va_space)
goto done;
// Re-parse the new faults
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_duplicate_faults = 0;
batch_context->num_replays = 0;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_ALL);
if (status != NV_OK)
goto done;
// No more faults left. Either the previously-seen fatal entry was stale, or
// RM killed the context underneath us.
if (batch_context->num_cached_faults == 0)
goto done;
++batch_context->batch_id;
status = preprocess_fault_batch(gpu, batch_context);
if (status != NV_OK) {
if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
// Another flush happened due to stale faults or a context-fatal
// error. The previously-seen fatal fault might not exist anymore,
// so restart fault servicing from the top.
status = NV_OK;
}
goto done;
}
// Search for the target VA space
for (i = 0; i < batch_context->num_coalesced_faults; i++) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
UVM_ASSERT(current_entry->va_space);
if (current_entry->va_space == va_space)
break;
}
while (i < batch_context->num_coalesced_faults) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
if (current_entry->va_space != va_space)
break;
// service_fault_batch_dispatch() doesn't expect unserviceable faults.
// Just cancel them directly.
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, UVM_FAULT_CANCEL_VA_MODE_ALL);
if (status != NV_OK)
break;
++i;
}
else {
uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
NvU32 block_faults;
ats_invalidate->tlb_batch_pending = false;
uvm_hmm_service_context_init(service_context);
// Service all the faults that we can. We only really need to search
// for fatal faults, but attempting to service all is the easiest
// way to do that.
status = service_fault_batch_dispatch(va_space, gpu_va_space, batch_context, i, &block_faults, false);
if (status != NV_OK) {
// TODO: Bug 3900733: clean up locking in service_fault_batch().
// We need to drop lock and retry. That means flushing and
// starting over.
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
status = NV_OK;
break;
}
// Invalidate TLBs before cancel to ensure that fatal faults don't
// get stuck in HW behind non-fatal faults to the same line.
status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
if (status != NV_OK)
break;
while (block_faults-- > 0) {
current_entry = batch_context->ordered_fault_cache[i];
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode);
if (status != NV_OK)
break;
}
++i;
}
}
}
done:
uvm_va_space_up_read(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
if (status == NV_OK) {
// There are two reasons to flush the fault buffer here.
//
// 1) Functional. We need to replay both the serviced non-fatal faults
// and the skipped faults in other VA spaces. The former need to be
// restarted and the latter need to be replayed so the normal fault
// service mechanism can fetch and process them.
//
// 2) Performance. After cancelling the fatal faults, a flush removes
// any potential duplicated fault that may have been added while
// processing the faults in this batch. This flush also avoids doing
// unnecessary processing after the fatal faults have been cancelled,
// so all the rest are unlikely to remain after a replay because the
// context is probably in the process of dying.
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
}
return status;
}
// Scan the ordered view of faults and group them by different va_blocks
// (managed faults) and service faults for each va_block, in batch.
// Service non-managed faults one at a time as they are encountered during the
// scan.
//
// This function returns NV_WARN_MORE_PROCESSING_REQUIRED if the fault buffer
// was flushed because the needs_fault_buffer_flush flag was set on some GPU VA
// space
// Fatal faults are marked for later processing by the caller.
static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
fault_service_mode_t service_mode,
uvm_fault_service_batch_context_t *batch_context)
@@ -1963,7 +2151,7 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
UVM_ASSERT(gpu->parent->replayable_faults_supported);
ats_invalidate->write_faults_in_batch = false;
ats_invalidate->tlb_batch_pending = false;
uvm_hmm_service_context_init(service_context);
for (i = 0; i < batch_context->num_coalesced_faults;) {
@@ -1998,38 +2186,25 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
va_block_context->mm = mm;
uvm_va_space_down_read(va_space);
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (uvm_processor_mask_test_and_clear_atomic(&va_space->needs_fault_buffer_flush, gpu->id)) {
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status == NV_OK)
status = NV_WARN_MORE_PROCESSING_REQUIRED;
break;
}
// The case where there is no valid GPU VA space for the GPU in this
// VA space is handled next
}
// Some faults could be already fatal if they cannot be handled by
// the UVM driver
if (current_entry->is_fatal) {
++i;
batch_context->has_fatal_faults = true;
if (!batch_context->fatal_va_space)
batch_context->fatal_va_space = va_space;
utlb->has_fatal_faults = true;
UVM_ASSERT(utlb->num_pending_faults > 0);
continue;
}
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
if (!gpu_va_space) {
// If there is no GPU VA space for the GPU, ignore the fault. This
// can happen if a GPU VA space is destroyed without explicitly
// freeing all memory ranges (destroying the VA range triggers a
// flush of the fault buffer) and there are stale entries in the
// freeing all memory ranges and there are stale entries in the
// buffer that got fixed by the servicing in a previous batch.
++i;
continue;
@@ -2057,7 +2232,7 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
i += block_faults;
// Don't issue replays in cancel mode
if (replay_per_va_block && !batch_context->has_fatal_faults) {
if (replay_per_va_block && !batch_context->fatal_va_space) {
status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
if (status != NV_OK)
goto fail;
@@ -2069,8 +2244,6 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
}
}
// Only clobber status if invalidate_status != NV_OK, since status may also
// contain NV_WARN_MORE_PROCESSING_REQUIRED.
if (va_space != NULL) {
NV_STATUS invalidate_status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
if (invalidate_status != NV_OK)
@@ -2278,64 +2451,6 @@ static NvU32 is_fatal_fault_in_buffer(uvm_fault_service_batch_context_t *batch_c
return false;
}
// Cancel just the faults flagged as fatal in the given fault service batch
// context.
static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
NV_STATUS status = NV_OK;
NV_STATUS fault_status;
uvm_va_space_t *va_space = NULL;
NvU32 i;
UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
UVM_ASSERT(current_entry->va_space);
if (current_entry->va_space != va_space) {
// Fault on a different va_space, drop the lock of the old one...
if (va_space != NULL)
uvm_va_space_up_read(va_space);
va_space = current_entry->va_space;
// ... and take the lock of the new one
uvm_va_space_down_read(va_space);
// We don't need to check whether a buffer flush is required
// (due to VA range destruction). Once a fault is flagged as fatal
// we need to cancel it, even if its VA range no longer exists.
}
// See the comment for the same check in cancel_faults_all
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id))
continue;
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode);
if (status != NV_OK)
break;
}
}
if (va_space != NULL)
uvm_va_space_up_read(va_space);
// See the comment on flushing in cancel_faults_all
fault_status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
// We report the first encountered error.
if (status == NV_OK)
status = fault_status;
return status;
}
// Cancel all faults in the given fault service batch context, even those not
// marked as fatal.
static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
@@ -2344,56 +2459,51 @@ static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
{
NV_STATUS status = NV_OK;
NV_STATUS fault_status;
uvm_va_space_t *va_space = NULL;
NvU32 i;
NvU32 i = 0;
UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
while (i < batch_context->num_coalesced_faults && status == NV_OK) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
uvm_fault_cancel_va_mode_t cancel_va_mode;
uvm_va_space_t *va_space = current_entry->va_space;
bool skip_va_space;
UVM_ASSERT(current_entry->va_space);
UVM_ASSERT(va_space);
if (current_entry->va_space != va_space) {
// Fault on a different va_space, drop the lock of the old one...
if (va_space != NULL)
uvm_va_space_up_read(va_space);
uvm_va_space_down_read(va_space);
va_space = current_entry->va_space;
// If there is no GPU VA space for the GPU, ignore all faults in
// that VA space. This can happen if the GPU VA space has been
// destroyed since we unlocked the VA space in service_fault_batch.
// Ignoring the fault avoids targeting a PDB that might have been
// reused by another process.
skip_va_space = !uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
// ... and take the lock of the new one
uvm_va_space_down_read(va_space);
for (;
i < batch_context->num_coalesced_faults && current_entry->va_space == va_space;
current_entry = batch_context->ordered_fault_cache[++i]) {
uvm_fault_cancel_va_mode_t cancel_va_mode;
if (skip_va_space)
continue;
if (current_entry->is_fatal) {
UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
cancel_va_mode = current_entry->replayable.cancel_va_mode;
}
else {
current_entry->fatal_reason = reason;
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
if (status != NV_OK)
break;
}
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
// If there is no GPU VA space for the GPU, ignore the fault.
// This can happen if the GPU VA did not exist in
// service_fault_batch(), or it was destroyed since then.
// This is to avoid targeting a PDB that might have been reused
// by another process.
continue;
}
// If the fault was already marked fatal, use its reason and cancel
// mode. Otherwise use the provided reason.
if (current_entry->is_fatal) {
UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
cancel_va_mode = current_entry->replayable.cancel_va_mode;
}
else {
current_entry->fatal_reason = reason;
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
if (status != NV_OK)
break;
}
if (va_space != NULL)
uvm_va_space_up_read(va_space);
}
// Because each cancel itself triggers a replay, there may be a large number
// of new duplicated faults in the buffer after cancelling all the known
@@ -2537,7 +2647,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_replays = 0;
batch_context->has_fatal_faults = false;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
// 5) Fetch all faults from buffer
@@ -2584,9 +2694,6 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
// 8) Service all non-fatal faults and mark all non-serviceable faults
// as fatal
status = service_fault_batch(gpu, FAULT_SERVICE_MODE_CANCEL, batch_context);
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
continue;
UVM_ASSERT(batch_context->num_replays == 0);
if (status == NV_ERR_NO_MEMORY)
continue;
@@ -2594,7 +2701,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
break;
// No more fatal faults left, we are done
if (!batch_context->has_fatal_faults)
if (!batch_context->fatal_va_space)
break;
// 9) Search for uTLBs that contain fatal faults and meet the
@@ -2616,9 +2723,9 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
static NV_STATUS cancel_faults_precise(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
UVM_ASSERT(batch_context->has_fatal_faults);
UVM_ASSERT(batch_context->fatal_va_space);
if (gpu->parent->fault_cancel_va_supported)
return cancel_faults_precise_va(gpu, batch_context);
return service_fault_batch_for_cancel(gpu, batch_context);
return cancel_faults_precise_tlb(gpu, batch_context);
}
@@ -2674,7 +2781,7 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_duplicate_faults = 0;
batch_context->num_replays = 0;
batch_context->has_fatal_faults = false;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_BATCH_READY);
@@ -2702,9 +2809,6 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
// was flushed
num_replays += batch_context->num_replays;
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
continue;
enable_disable_prefetch_faults(gpu->parent, batch_context);
if (status != NV_OK) {
@@ -2718,10 +2822,17 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
break;
}
if (batch_context->has_fatal_faults) {
if (batch_context->fatal_va_space) {
status = uvm_tracker_wait(&batch_context->tracker);
if (status == NV_OK)
if (status == NV_OK) {
status = cancel_faults_precise(gpu, batch_context);
if (status == NV_OK) {
// Cancel handling should've issued at least one replay
UVM_ASSERT(batch_context->num_replays > 0);
++num_batches;
continue;
}
}
break;
}

View File

@@ -103,5 +103,7 @@ void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->map_remap_larger_page_promotion = false;
parent_gpu->plc_supported = true;
parent_gpu->no_ats_range_required = true;
}

View File

@@ -33,6 +33,7 @@
#include "uvm_types.h"
#include "uvm_global.h"
#include "uvm_common.h"
#include "uvm_hal.h"
#include "uvm_hal_types.h"
#include "uvm_hopper_fault_buffer.h"
@@ -42,6 +43,10 @@
#define MMU_BIG 0
#define MMU_SMALL 1
// Used in pde_pcf().
#define ATS_ALLOWED 0
#define ATS_NOT_ALLOWED 1
uvm_mmu_engine_type_t uvm_hal_hopper_mmu_engine_id_to_type(NvU16 mmu_engine_id)
{
if (mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_HOST0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_HOST44)
@@ -260,7 +265,108 @@ static NvU64 poisoned_pte_hopper(void)
return WRITE_HWCONST64(pte_bits, _MMU_VER3, PTE, PCF, PRIVILEGE_RO_NO_ATOMIC_UNCACHED_ACD);
}
static NvU64 single_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, NvU32 depth)
typedef enum
{
PDE_TYPE_SINGLE,
PDE_TYPE_DUAL_BIG,
PDE_TYPE_DUAL_SMALL,
PDE_TYPE_COUNT,
} pde_type_t;
static const NvU8 valid_pcf[][2] = { { NV_MMU_VER3_PDE_PCF_VALID_UNCACHED_ATS_ALLOWED,
NV_MMU_VER3_PDE_PCF_VALID_UNCACHED_ATS_NOT_ALLOWED },
{ NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_UNCACHED_ATS_ALLOWED,
NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_UNCACHED_ATS_NOT_ALLOWED },
{ NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_UNCACHED_ATS_ALLOWED,
NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_UNCACHED_ATS_NOT_ALLOWED } };
static const NvU8 invalid_pcf[][2] = { { NV_MMU_VER3_PDE_PCF_INVALID_ATS_ALLOWED,
NV_MMU_VER3_PDE_PCF_INVALID_ATS_NOT_ALLOWED },
{ NV_MMU_VER3_DUAL_PDE_PCF_BIG_INVALID_ATS_ALLOWED,
NV_MMU_VER3_DUAL_PDE_PCF_BIG_INVALID_ATS_NOT_ALLOWED },
{ NV_MMU_VER3_DUAL_PDE_PCF_SMALL_INVALID_ATS_ALLOWED,
NV_MMU_VER3_DUAL_PDE_PCF_SMALL_INVALID_ATS_NOT_ALLOWED } };
static const NvU8 va_base[] = { 56, 47, 38, 29, 21 };
static bool is_ats_range_valid(uvm_page_directory_t *dir, NvU32 child_index)
{
NvU64 pde_base_va;
NvU64 min_va_upper;
NvU64 max_va_lower;
NvU32 index_in_dir;
uvm_cpu_get_unaddressable_range(&max_va_lower, &min_va_upper);
UVM_ASSERT(dir->depth < ARRAY_SIZE(va_base));
// We can use UVM_PAGE_SIZE_AGNOSTIC because page_size is only used in
// index_bits_hopper() for the PTE table, i.e., depth 5+, which does not use a
// PDE PCF or an ATS_ALLOWED/NOT_ALLOWED setting.
UVM_ASSERT(child_index < (1ull << index_bits_hopper(dir->depth, UVM_PAGE_SIZE_AGNOSTIC)));
pde_base_va = 0;
index_in_dir = child_index;
while (dir) {
pde_base_va += index_in_dir * (1ull << va_base[dir->depth]);
index_in_dir = dir->index_in_parent;
dir = dir->host_parent;
}
pde_base_va = (NvU64)((NvS64)(pde_base_va << (64 - num_va_bits_hopper())) >> (64 - num_va_bits_hopper()));
if (pde_base_va < max_va_lower || pde_base_va >= min_va_upper)
return true;
return false;
}
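// --- Illustrative sketch, not part of this change ---
// The sign-extension above turns an N-bit VA into its 64-bit canonical form
// (the upper bits become copies of bit N-1). Below is a standalone example of
// the same expression, assuming a hypothetical 57-bit VA width purely for
// illustration; example_canonical() is not a driver function.
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t example_canonical(uint64_t va, unsigned va_bits)
{
    // Shift the VA up so bit (va_bits - 1) becomes bit 63, then arithmetic
    // shift back down to replicate it into the upper bits.
    return (uint64_t)(((int64_t)(va << (64 - va_bits))) >> (64 - va_bits));
}

int main(void)
{
    unsigned bits = 57;

    // Lower-half address: unchanged by canonicalization.
    printf("0x%016" PRIx64 "\n", example_canonical(0x0000123456789000ull, bits));

    // Upper-half address (bit 56 set): the top bits get filled with ones,
    // printing 0xff00000000000000.
    printf("0x%016" PRIx64 "\n", example_canonical(0x0100000000000000ull, bits));
    return 0;
}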
// PDE Permission Control Flags
static NvU32 pde_pcf(bool valid, pde_type_t pde_type, uvm_page_directory_t *dir, NvU32 child_index)
{
const NvU8 (*pcf)[2] = valid ? valid_pcf : invalid_pcf;
NvU8 depth = dir->depth;
UVM_ASSERT(pde_type < PDE_TYPE_COUNT);
UVM_ASSERT(depth < 5);
// On non-ATS systems, PDE PCF only sets the valid and volatile/cache bits.
if (!g_uvm_global.ats.enabled)
return pcf[pde_type][ATS_ALLOWED];
// We assume all supported ATS platforms use canonical form addresses.
// See comments in uvm_gpu.c:uvm_gpu_can_address() and in
// uvm_mmu.c:page_tree_ats_init();
UVM_ASSERT(uvm_platform_uses_canonical_form_address());
// Hopper GPUs on ATS-enabled systems perform a parallel lookup on both
// ATS and GMMU page tables. For managed memory we need to prevent this
// parallel lookup since we would not get any GPU fault if the CPU has
// a valid mapping. Also, for external ranges that are known to be
// mapped entirely on the GMMU page table we can skip the ATS lookup
// for performance reasons. Parallel ATS lookup is disabled in PDE1
// (depth 3) and, therefore, it applies to the underlying 512MB VA
// range.
//
// UVM sets ATS_NOT_ALLOWED for all Hopper+ mappings on ATS systems.
// This is fine because CUDA ensures that all managed and external
// allocations are properly compartmentalized in 512MB-aligned VA
// regions. For cudaHostRegister CUDA cannot control the VA range, but
// we rely on ATS for those allocations so they can't choose the
// ATS_NOT_ALLOWED mode.
// TODO: Bug 3254055: Relax the NO_ATS setting from 512MB (pde1) range to
// PTEs.
// HW complies with the leaf PDE's ATS_ALLOWED/ATS_NOT_ALLOWED settings,
// enabling us to treat any upper-level PDE as a don't care as long as there
// are leaf PDEs for the entire upper-level PDE range. We assume PDE4
// entries (depth == 0) are always ATS enabled, and the no_ats_range is in
// PDE3 or lower.
if (depth == 0 || (!valid && is_ats_range_valid(dir, child_index)))
return pcf[pde_type][ATS_ALLOWED];
return pcf[pde_type][ATS_NOT_ALLOWED];
}
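// --- Illustrative sketch, not part of this change ---
// pde_pcf() picks its PCF with a two-step lookup: valid vs. invalid selects
// one of two tables, then [pde_type][ats] indexes into it. Below is a
// standalone sketch of that pointer-to-array-of-2 idiom; the EX_* names and
// PCF values are made up for illustration and do not match the HW encodings.
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { EX_ATS_ALLOWED = 0, EX_ATS_NOT_ALLOWED = 1 };
enum { EX_PDE_SINGLE = 0, EX_PDE_DUAL_BIG = 1, EX_PDE_DUAL_SMALL = 2 };

// Hypothetical PCF encodings, one row per PDE type, two columns per row.
static const uint8_t ex_valid_pcf[][2]   = { { 0x0, 0x1 }, { 0x2, 0x3 }, { 0x4, 0x5 } };
static const uint8_t ex_invalid_pcf[][2] = { { 0x8, 0x9 }, { 0xa, 0xb }, { 0xc, 0xd } };

static uint8_t ex_pde_pcf(bool valid, int pde_type, bool ats_allowed)
{
    // Pointer to an array of 2 bytes: one whole table, indexed by row then column.
    const uint8_t (*pcf)[2] = valid ? ex_valid_pcf : ex_invalid_pcf;

    return pcf[pde_type][ats_allowed ? EX_ATS_ALLOWED : EX_ATS_NOT_ALLOWED];
}

int main(void)
{
    // Valid dual-big PDE with ATS disallowed -> 0x3 in this made-up table.
    printf("0x%x\n", ex_pde_pcf(true, EX_PDE_DUAL_BIG, false));
    return 0;
}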
static NvU64 single_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, uvm_page_directory_t *dir, NvU32 child_index)
{
NvU64 pde_bits = 0;
@@ -280,38 +386,17 @@ static NvU64 single_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, NvU32 dep
break;
}
// PCF (permission control flags) 5:3
// Hopper GPUs on ATS-enabled systems, perform a parallel lookup on both
// ATS and GMMU page tables. For managed memory we need to prevent this
// parallel lookup since we would not get any GPU fault if the CPU has
// a valid mapping. Also, for external ranges that are known to be
// mapped entirely on the GMMU page table we can skip the ATS lookup
// for performance reasons. Parallel ATS lookup is disabled in PDE1
// (depth 3) and, therefore, it applies to the underlying 512MB VA
// range.
//
// UVM sets ATS_NOT_ALLOWED for all Hopper+ mappings on ATS systems.
// This is fine because CUDA ensures that all managed and external
// allocations are properly compartmentalized in 512MB-aligned VA
// regions. For cudaHostRegister CUDA cannot control the VA range, but
// we rely on ATS for those allocations so they can't choose the
// ATS_NOT_ALLOWED mode.
//
// TODO: Bug 3254055: Relax the NO_ATS setting from 512MB (pde1) range
// to PTEs.
if (depth == 3 && g_uvm_global.ats.enabled)
pde_bits |= HWCONST64(_MMU_VER3, PDE, PCF, VALID_UNCACHED_ATS_NOT_ALLOWED);
else
pde_bits |= HWCONST64(_MMU_VER3, PDE, PCF, VALID_UNCACHED_ATS_ALLOWED);
// address 51:12
pde_bits |= HWVALUE64(_MMU_VER3, PDE, ADDRESS, address);
}
// PCF (permission control flags) 5:3
pde_bits |= HWVALUE64(_MMU_VER3, PDE, PCF, pde_pcf(phys_alloc != NULL, PDE_TYPE_SINGLE, dir, child_index));
return pde_bits;
}
static NvU64 big_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
static NvU64 big_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, uvm_page_directory_t *dir, NvU32 child_index)
{
NvU64 pde_bits = 0;
@@ -330,17 +415,20 @@ static NvU64 big_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
break;
}
// PCF (permission control flags) 5:3
pde_bits |= HWCONST64(_MMU_VER3, DUAL_PDE, PCF_BIG, VALID_UNCACHED_ATS_NOT_ALLOWED);
// address 51:8
pde_bits |= HWVALUE64(_MMU_VER3, DUAL_PDE, ADDRESS_BIG, address);
}
// PCF (permission control flags) 5:3
pde_bits |= HWVALUE64(_MMU_VER3,
DUAL_PDE,
PCF_BIG,
pde_pcf(phys_alloc != NULL, PDE_TYPE_DUAL_BIG, dir, child_index));
return pde_bits;
}
static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, uvm_page_directory_t *dir, NvU32 child_index)
{
NvU64 pde_bits = 0;
@@ -359,32 +447,40 @@ static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
break;
}
// PCF (permission control flags) 69:67 [5:3]
pde_bits |= HWCONST64(_MMU_VER3, DUAL_PDE, PCF_SMALL, VALID_UNCACHED_ATS_NOT_ALLOWED);
// address 115:76 [51:12]
pde_bits |= HWVALUE64(_MMU_VER3, DUAL_PDE, ADDRESS_SMALL, address);
}
// PCF (permission control flags) 69:67 [5:3]
pde_bits |= HWVALUE64(_MMU_VER3,
DUAL_PDE,
PCF_SMALL,
pde_pcf(phys_alloc != NULL, PDE_TYPE_DUAL_SMALL, dir, child_index));
return pde_bits;
}
static void make_pde_hopper(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
uvm_page_directory_t *dir,
NvU32 child_index)
{
NvU32 entry_count = entries_per_index_hopper(depth);
NvU32 entry_count;
NvU64 *entry_bits = (NvU64 *)entry;
UVM_ASSERT(dir);
entry_count = entries_per_index_hopper(dir->depth);
if (entry_count == 1) {
*entry_bits = single_pde_hopper(*phys_allocs, depth);
*entry_bits = single_pde_hopper(*phys_allocs, dir, child_index);
}
else if (entry_count == 2) {
entry_bits[MMU_BIG] = big_half_pde_hopper(phys_allocs[MMU_BIG]);
entry_bits[MMU_SMALL] = small_half_pde_hopper(phys_allocs[MMU_SMALL]);
entry_bits[MMU_BIG] = big_half_pde_hopper(phys_allocs[MMU_BIG], dir, child_index);
entry_bits[MMU_SMALL] = small_half_pde_hopper(phys_allocs[MMU_SMALL], dir, child_index);
// This entry applies to the whole dual PDE but is stored in the lower
// bits
// bits.
entry_bits[MMU_BIG] |= HWCONST64(_MMU_VER3, DUAL_PDE, IS_PTE, FALSE);
}
else {

View File

@@ -128,8 +128,9 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
// present if we see the callback.
//
// The callback was added in commit 0f0a327fa12cd55de5e7f8c05a70ac3d047f405e,
// v3.19 (2014-11-13).
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE)
// v3.19 (2014-11-13) and renamed in commit 1af5a8109904.
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE) || \
defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
#define UVM_CAN_USE_MMU_NOTIFIERS() 1
#else
#define UVM_CAN_USE_MMU_NOTIFIERS() 0

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2021 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -71,4 +71,6 @@ void uvm_hal_maxwell_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->smc.supported = false;
parent_gpu->plc_supported = false;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -108,11 +108,14 @@ static NvU64 small_half_pde_maxwell(uvm_mmu_page_table_alloc_t *phys_alloc)
static void make_pde_maxwell(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
uvm_page_directory_t *dir,
NvU32 child_index)
{
NvU64 pde_bits = 0;
UVM_ASSERT(depth == 0);
UVM_ASSERT(dir);
UVM_ASSERT(dir->depth == 0);
pde_bits |= HWCONST64(_MMU, PDE, SIZE, FULL);
pde_bits |= big_half_pde_maxwell(phys_allocs[MMU_BIG]) | small_half_pde_maxwell(phys_allocs[MMU_SMALL]);

View File

@@ -672,14 +672,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
.finalize_and_map = uvm_migrate_vma_finalize_and_map_helper,
};
// WAR for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
//
// This code path isn't used on GH180 but we need to maintain consistent
// behaviour on systems that do.
if (!vma_is_anonymous(args->vma))
return NV_WARN_NOTHING_TO_DO;
ret = migrate_vma(&uvm_migrate_vma_ops, args->vma, args->start, args->end, args->src, args->dst, state);
if (ret < 0)
return errno_to_nv_status(ret);
@@ -693,24 +685,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
if (ret < 0)
return errno_to_nv_status(ret);
// TODO: Bug 2419180: support file-backed pages in migrate_vma, when
// support for it is added to the Linux kernel
//
// A side-effect of migrate_vma_setup() is it calls mmu notifiers even if a
// page can't be migrated (eg. because it's a non-anonymous mapping). We
// need this side-effect for SMMU on GH180 to ensure any cached read-only
// entries are flushed from SMMU on permission upgrade.
//
// TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
//
// The above WAR doesn't work for HugeTLBfs mappings because
// migrate_vma_setup() will fail in that case.
if (!vma_is_anonymous(args->vma)) {
migrate_vma_finalize(args);
return NV_WARN_NOTHING_TO_DO;
}
uvm_migrate_vma_alloc_and_copy(args, state);
if (state->status == NV_OK) {
migrate_vma_pages(args);
@@ -862,6 +836,17 @@ static NV_STATUS migrate_pageable_vma_region(struct vm_area_struct *vma,
return NV_OK;
}
NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
uvm_va_space_down_write(va_space);
va_space->test.skip_migrate_vma = params->skip;
uvm_va_space_up_write(va_space);
return NV_OK;
}
static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
unsigned long start,
unsigned long outer,
@@ -884,13 +869,12 @@ static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
start = max(start, vma->vm_start);
outer = min(outer, vma->vm_end);
// migrate_vma only supports anonymous VMAs. We check for those after
// calling migrate_vma_setup() to workaround Bug 4130089. We need to check
// for HugeTLB VMAs here because migrate_vma_setup() will return a fatal
// error for those.
// TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
if (is_vm_hugetlb_page(vma))
if (va_space->test.skip_migrate_vma)
return NV_WARN_NOTHING_TO_DO;
// TODO: Bug 2419180: support file-backed pages in migrate_vma, when
// support for it is added to the Linux kernel
if (!vma_is_anonymous(vma))
return NV_WARN_NOTHING_TO_DO;
if (uvm_processor_mask_empty(&va_space->registered_gpus))
@@ -950,7 +934,9 @@ static NV_STATUS migrate_pageable(migrate_vma_state_t *state)
bool touch = uvm_migrate_args->touch;
uvm_populate_permissions_t populate_permissions = uvm_migrate_args->populate_permissions;
UVM_ASSERT(!vma_is_anonymous(vma) || uvm_processor_mask_empty(&va_space->registered_gpus));
UVM_ASSERT(va_space->test.skip_migrate_vma ||
!vma_is_anonymous(vma) ||
uvm_processor_mask_empty(&va_space->registered_gpus));
// We can't use migrate_vma to move the pages as desired. Normally
// this fallback path is supposed to populate the memory then inform

View File

@@ -218,6 +218,9 @@ NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args);
NV_STATUS uvm_migrate_pageable_init(void);
void uvm_migrate_pageable_exit(void);
NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp);
#else // UVM_MIGRATE_VMA_SUPPORTED
static NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
@@ -251,6 +254,10 @@ static void uvm_migrate_pageable_exit(void)
{
}
static inline NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp)
{
return NV_OK;
}
#endif // UVM_MIGRATE_VMA_SUPPORTED
#endif

View File

@@ -338,7 +338,7 @@ static void pde_fill_cpu(uvm_page_tree_t *tree,
UVM_ASSERT(sizeof(pde_data) >= entry_size);
for (i = 0; i < pde_count; i++) {
tree->hal->make_pde(pde_data, phys_addr, directory->depth, directory->entries[start_index + i]);
tree->hal->make_pde(pde_data, phys_addr, directory, start_index + i);
if (entry_size == sizeof(pde_data[0]))
uvm_mmu_page_table_cpu_memset_8(tree->gpu, &directory->phys_alloc, start_index + i, pde_data[0], 1);
@@ -393,7 +393,7 @@ static void pde_fill_gpu(uvm_page_tree_t *tree,
uvm_push_inline_data_begin(push, &inline_data);
for (j = 0; j < entry_count; j++) {
tree->hal->make_pde(pde_data, phys_addr, directory->depth, directory->entries[start_index + i + j]);
tree->hal->make_pde(pde_data, phys_addr, directory, start_index + i + j);
uvm_push_inline_data_add(&inline_data, pde_data, entry_size);
}
inline_data_addr = uvm_push_inline_data_end(&inline_data);
@@ -407,9 +407,7 @@ static void pde_fill_gpu(uvm_page_tree_t *tree,
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
// pde_fill() is optimized for pde_count == 1, which is the common case. The
// map_remap() function is the only case where pde_count > 1, only used on GA100
// GPUs for 512MB page size mappings.
// pde_fill() is optimized for pde_count == 1, which is the common case.
static void pde_fill(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
@@ -428,21 +426,26 @@ static void pde_fill(uvm_page_tree_t *tree,
static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
{
NvU32 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);
NvU8 max_pde_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC) - 1;
// Passing in NULL for the phys_allocs will mark the child entries as
// invalid.
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
// Init with an invalid PTE or clean PDE. Only Maxwell PDEs can have more
// than 512 entries. We initialize them all with the same clean PDE.
// Additionally, only ATS systems may require clean PDEs bit settings based
// on the mapping VA.
if (dir->depth == tree->hal->page_table_depth(page_size) || (entries_count > 512 && !g_uvm_global.ats.enabled)) {
// than 512 entries. In this case, we initialize them all with the same
// clean PDE. ATS systems may require clean PDEs with
// ATS_ALLOWED/ATS_NOT_ALLOWED bit settings based on the mapping VA.
// We only set clear_bits to 0 at the lowest page table level (PTE table), i.e.,
// when depth is greater than the max_pde_depth.
if ((dir->depth > max_pde_depth) || (entries_count > 512 && !g_uvm_global.ats.enabled)) {
NvU64 clear_bits[2];
// If it is not a PTE, make a clean PDE.
if (dir->depth != tree->hal->page_table_depth(page_size)) {
tree->hal->make_pde(clear_bits, phys_allocs, dir->depth, dir->entries[0]);
// The make_pde() child index is zero and effectively ignored, since it is
// only used for PDEs on ATS-enabled systems, where pde_fill() is preferred.
tree->hal->make_pde(clear_bits, phys_allocs, dir, 0);
// Make sure that using only clear_bits[0] will work.
UVM_ASSERT(tree->hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
@@ -816,7 +819,6 @@ static void free_unused_directories(uvm_page_tree_t *tree,
}
}
}
}
static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm_mmu_page_table_alloc_t *out)
@@ -827,6 +829,86 @@ static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm
return phys_mem_allocate(tree, alloc_size, tree->location, UVM_PMM_ALLOC_FLAGS_EVICT, out);
}
static bool page_tree_ats_init_required(uvm_page_tree_t *tree)
{
// We have full control of the kernel page table mappings; no ATS address
// aliases are expected.
if (tree->type == UVM_PAGE_TREE_TYPE_KERNEL)
return false;
// Enable uvm_page_tree_init() from the page_tree test.
if (uvm_enable_builtin_tests && tree->gpu_va_space == NULL)
return false;
if (!tree->gpu_va_space->ats.enabled)
return false;
return tree->gpu->parent->no_ats_range_required;
}
static NV_STATUS page_tree_ats_init(uvm_page_tree_t *tree)
{
NV_STATUS status;
NvU64 min_va_upper, max_va_lower;
NvU32 page_size;
if (!page_tree_ats_init_required(tree))
return NV_OK;
page_size = uvm_mmu_biggest_page_size(tree);
uvm_cpu_get_unaddressable_range(&max_va_lower, &min_va_upper);
// Potential violation of the UVM internal get/put_ptes contract. get_ptes()
// creates and initializes enough PTEs to populate all PDEs covering the
// no_ats_ranges. We store the no_ats_ranges in the tree, so they can be
// put_ptes()'ed on deinit(). It doesn't preclude the range from being used by a
// future get_ptes(), since we don't write to the PTEs (range->table) from
// the tree->no_ats_ranges.
//
// Lower half
status = uvm_page_tree_get_ptes(tree,
page_size,
max_va_lower,
page_size,
UVM_PMM_ALLOC_FLAGS_EVICT,
&tree->no_ats_ranges[0]);
if (status != NV_OK)
return status;
UVM_ASSERT(tree->no_ats_ranges[0].entry_count == 1);
if (uvm_platform_uses_canonical_form_address()) {
// Upper half
status = uvm_page_tree_get_ptes(tree,
page_size,
min_va_upper - page_size,
page_size,
UVM_PMM_ALLOC_FLAGS_EVICT,
&tree->no_ats_ranges[1]);
if (status != NV_OK)
return status;
UVM_ASSERT(tree->no_ats_ranges[1].entry_count == 1);
}
return NV_OK;
}
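// --- Illustrative sketch, not part of this change ---
// The no_ats_ranges bracket the canonical-form address hole: everything in
// [max_va_lower, min_va_upper) is unaddressable by the CPU. Below is a
// standalone example computing that hole for a hypothetical 48-bit CPU VA
// width; the helper name and the exact lower/upper split are assumptions made
// for illustration, not the driver's uvm_cpu_get_unaddressable_range().
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void example_unaddressable_range(unsigned cpu_va_bits,
                                        uint64_t *max_va_lower,
                                        uint64_t *min_va_upper)
{
    // Lower half ends at 2^(bits-1); the upper half starts at -2^(bits-1)
    // in two's complement, i.e. all-ones in the top bits.
    *max_va_lower = 1ull << (cpu_va_bits - 1);
    *min_va_upper = (uint64_t)-(1ll << (cpu_va_bits - 1));
}

int main(void)
{
    uint64_t lo, hi;

    example_unaddressable_range(48, &lo, &hi);

    // Prints [0x0000800000000000, 0xffff800000000000) for 48 VA bits.
    printf("hole: [0x%016" PRIx64 ", 0x%016" PRIx64 ")\n", lo, hi);
    return 0;
}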
static void page_tree_ats_deinit(uvm_page_tree_t *tree)
{
size_t i;
if (page_tree_ats_init_required(tree)) {
for (i = 0; i < ARRAY_SIZE(tree->no_ats_ranges); i++) {
if (tree->no_ats_ranges[i].entry_count)
uvm_page_tree_put_ptes(tree, &tree->no_ats_ranges[i]);
}
memset(tree->no_ats_ranges, 0, sizeof(tree->no_ats_ranges));
}
}
static void map_remap_deinit(uvm_page_tree_t *tree)
{
if (tree->map_remap.pde0) {
@@ -1032,11 +1114,22 @@ NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
return status;
phys_mem_init(tree, UVM_PAGE_SIZE_AGNOSTIC, tree->root, &push);
return page_tree_end_and_wait(tree, &push);
status = page_tree_end_and_wait(tree, &push);
if (status != NV_OK)
return status;
status = page_tree_ats_init(tree);
if (status != NV_OK)
return status;
return NV_OK;
}
void uvm_page_tree_deinit(uvm_page_tree_t *tree)
{
page_tree_ats_deinit(tree);
UVM_ASSERT(tree->root->ref_count == 0);
// Take the tree lock only to avoid assertions. It is not required for
@@ -1275,7 +1368,6 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
UVM_ASSERT(uvm_gpu_can_address_kernel(tree->gpu, start, size));
while (true) {
// index of the entry, for the first byte of the range, within its
// containing directory
NvU32 start_index;
@@ -1307,7 +1399,8 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
if (dir_cache[dir->depth] == NULL) {
*cur_depth = dir->depth;
// Undo the changes to the tree so that the dir cache remains private to the thread
// Undo the changes to the tree so that the dir cache
// remains private to the thread.
for (i = 0; i < used_count; i++)
host_pde_clear(tree, dirs_used[i]->host_parent, dirs_used[i]->index_in_parent, page_size);
@@ -1405,7 +1498,8 @@ NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
dir_cache)) == NV_ERR_MORE_PROCESSING_REQUIRED) {
uvm_mutex_unlock(&tree->lock);
// try_get_ptes never needs depth 0, so store a directory at its parent's depth
// try_get_ptes never needs depth 0, so store a directory at its
// parent's depth.
// TODO: Bug 1766655: Allocate everything below cur_depth instead of
// retrying for every level.
dir_cache[cur_depth] = allocate_directory(tree, page_size, cur_depth + 1, pmm_flags);
@@ -1688,8 +1782,12 @@ NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
range);
if (status != NV_OK) {
UVM_ERR_PRINT("Failed to get PTEs for subrange %zd [0x%llx, 0x%llx) size 0x%llx, part of [0x%llx, 0x%llx)\n",
i, range_start, range_start + range_size, range_size,
start, size);
i,
range_start,
range_start + range_size,
range_size,
start,
size);
goto out;
}
}

View File

@@ -215,11 +215,14 @@ struct uvm_mmu_mode_hal_struct
// memory out-of-range error so we can immediately identify bad PTE usage.
NvU64 (*poisoned_pte)(void);
// write a PDE bit-pattern to entry based on the data in entries (which may
// Write a PDE bit-pattern to entry based on the data in allocs (which may
// point to two items for dual PDEs).
// any of allocs are allowed to be NULL, in which case they are to be
// treated as empty.
void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, NvU32 depth, uvm_page_directory_t *child_dir);
// Any of allocs are allowed to be NULL, in which case they are to be
// treated as empty. make_pde() uses dir and child_index to compute the
// mapping PDE VA. On ATS-enabled systems, we may set PDE's PCF as
// ATS_ALLOWED or ATS_NOT_ALLOWED based on the mapping PDE VA, even for
// invalid/clean PDE entries.
void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, uvm_page_directory_t *dir, NvU32 child_index);
// size of an entry in a directory/table. Generally either 8 or 16 bytes.
// (in the case of Pascal dual PDEs)
@@ -299,6 +302,12 @@ struct uvm_page_tree_struct
uvm_page_directory_t *pde0;
} map_remap;
// On ATS-enabled systems where the CPU VA width is smaller than the GPU VA
// width, the excess address range is set with ATS_NOT_ALLOWED on all leaf
// PDEs covering that range. We have at most 2 no_ats_ranges, due to
// canonical form address systems.
uvm_page_table_range_t no_ats_ranges[2];
// Tracker for all GPU operations on the tree
uvm_tracker_t tracker;
};

View File

@@ -1578,25 +1578,28 @@ static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
uvm_mmu_mode_hal_t *hal;
uvm_page_directory_t dir;
NvU32 i, j, big_page_size, page_size;
dir.depth = 0;
for (i = 0; i < ARRAY_SIZE(big_page_sizes); i++) {
big_page_size = big_page_sizes[i];
hal = gpu->parent->arch_hal->mmu_mode_hal(big_page_size);
memset(phys_allocs, 0, sizeof(phys_allocs));
hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits == 0x0L);
phys_allocs[0] = &alloc_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits == 0x1BBBBBBD99999992LL);
phys_allocs[0] = &alloc_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits == 0x9999999E1BBBBBB1LL);
for (j = 0; j <= 2; j++) {
@@ -1666,6 +1669,7 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
uvm_page_directory_t dir;
// big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
@@ -1673,32 +1677,39 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
dir.index_in_parent = 0;
dir.host_parent = NULL;
dir.depth = 0;
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
dir.depth = 0;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
// uncached, i.e., the sysmem data is not cached in GPU's L2 cache. Clear
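As a side note on the expected constants in the Pascal checks above, the packing can be verified by hand; the decode below is illustrative only and the flag naming is informal:

// Illustrative decode of the expected Pascal PDE values: the 4 KiB-aligned
// physical address is stored as a page frame number starting at bit 8, and the
// low bits carry aperture and related flags.
NvU64 example_sys_pde = ((0x399999999999000ULL >> 12) << 8) | 0xC; // == 0x3999999999990C
NvU64 example_vid_pde = ((0x1BBBBBB000ULL >> 12) << 8) | 0xA;      // == 0x1BBBBBB0A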
@@ -1754,6 +1765,7 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
uvm_page_directory_t dir;
// big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
@@ -1761,37 +1773,45 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
dir.index_in_parent = 0;
dir.host_parent = NULL;
dir.depth = 0;
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
dir.depth = 0;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
// NO_ATS PDE1 (depth 2)
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 2, NULL);
dir.depth = 2;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
if (g_uvm_global.ats.enabled)
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB2A);
else
@@ -1826,104 +1846,203 @@ static NV_STATUS entry_test_ampere(uvm_gpu_t *gpu, entry_test_page_size_func ent
static NV_STATUS entry_test_hopper(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NV_STATUS status = NV_OK;
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
uvm_page_directory_t *dirs[5];
size_t i, num_page_sizes;
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0xBBBBBBB000LL);
// big versions have [11:8] set as well to test the page table merging
// Big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999999900LL);
uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0xBBBBBBBB00LL);
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(dirs, 0, sizeof(dirs));
// Fake directory tree.
for (i = 0; i < ARRAY_SIZE(dirs); i++) {
dirs[i] = uvm_kvmalloc_zero(sizeof(uvm_page_directory_t) + sizeof(dirs[i]->entries[0]) * 512);
TEST_CHECK_GOTO(dirs[i] != NULL, cleanup);
dirs[i]->depth = i;
dirs[i]->index_in_parent = 0;
if (i == 0)
dirs[i]->host_parent = NULL;
else
dirs[i]->host_parent = dirs[i - 1];
}
// Make sure cleared PDEs work as expected.
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0, cleanup);
// Cleared PDEs work as expected for big and small PDEs.
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 4, NULL);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0 && pde_bits[1] == 0, cleanup);
// Sys and vidmem PDEs, uncached ATS allowed.
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
TEST_CHECK_RET(pde_bits[0] == 0x999999999900C);
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBB00A);
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBB00A, cleanup);
// Dual PDEs, uncached.
// Dual PDEs, uncached. The depth 4 checks don't depend on the child
// directory because the policy decides the PDE's PCF without using it.
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 4, NULL);
TEST_CHECK_RET(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A);
hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
if (g_uvm_global.ats.enabled)
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A, cleanup);
else
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999990C && pde_bits[1] == 0xBBBBBBB00A, cleanup);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 4, NULL);
TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C);
hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
if (g_uvm_global.ats.enabled)
TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C, cleanup);
else
TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBBB0A && pde_bits[1] == 0x999999999900C, cleanup);
// We only need to test make_pde() on ATS when the CPU VA width < GPU's.
if (g_uvm_global.ats.enabled && uvm_cpu_num_va_bits() < hal->num_va_bits()) {
phys_allocs[0] = &alloc_sys;
dirs[1]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
dirs[2]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 1;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 1);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 2;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 2);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 511;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 511);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[1]->index_in_parent = 1;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 1);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
dirs[2]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 509;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 509);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 510;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 510);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
phys_allocs[0] = NULL;
dirs[1]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
dirs[2]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
dirs[2]->index_in_parent = 2;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 2);
TEST_CHECK_GOTO(pde_bits[0] == 0x10, cleanup);
dirs[1]->index_in_parent = 1;
dirs[2]->index_in_parent = 509;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 509);
TEST_CHECK_GOTO(pde_bits[0] == 0x10, cleanup);
dirs[2]->index_in_parent = 510;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 510);
TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
}
// uncached, i.e., the sysmem data is not cached in GPU's L2 cache, and
// access counters disabled.
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) == 0x999999999968D);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) == 0x999999999968D,
cleanup);
// change to cached.
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED | UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) ==
0x9999999999685);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED | UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) ==
0x9999999999685,
cleanup);
// enable access counters.
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999605);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999605,
cleanup);
// remove atomic
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999645);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999645,
cleanup);
// read only
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999665);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999665,
cleanup);
// local video
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_VID,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0xBBBBBBB661);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_VID,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0xBBBBBBB661,
cleanup);
// peer 1
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_PEER_1,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x200000BBBBBBB663);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_PEER_1,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x200000BBBBBBB663,
cleanup);
// sparse
TEST_CHECK_RET(hal->make_sparse_pte() == 0x8);
TEST_CHECK_GOTO(hal->make_sparse_pte() == 0x8, cleanup);
// sked reflected
TEST_CHECK_RET(hal->make_sked_reflected_pte() == 0xF09);
TEST_CHECK_GOTO(hal->make_sked_reflected_pte() == 0xF09, cleanup);
num_page_sizes = get_page_sizes(gpu, page_sizes);
for (i = 0; i < num_page_sizes; i++)
TEST_NV_CHECK_RET(entry_test_page_size(gpu, page_sizes[i]));
TEST_NV_CHECK_GOTO(entry_test_page_size(gpu, page_sizes[i]), cleanup);
return NV_OK;
cleanup:
for (i = 0; i < ARRAY_SIZE(dirs); i++)
uvm_kvfree(dirs[i]);
return status;
}
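The Hopper test above probes PCF selection by varying index_in_parent along a fake directory chain. A rough sketch of how a (dir, child_index) pair identifies the PDE slot whose mapping VA the HAL evaluates; the per-level shift handling is omitted and assumed, so this is not the driver's actual helper:

// Sketch only: collect one index per level by walking host_parent; the real HAL
// combines these indices with the architecture's per-level VA shifts to compute
// the mapping PDE VA and choose ATS_ALLOWED vs ATS_NOT_ALLOWED.
static void example_collect_pde_indices(uvm_page_directory_t *dir,
                                        NvU32 child_index,
                                        NvU32 *index_per_depth)
{
    // Caller provides one slot per directory level plus one for the child.
    index_per_depth[dir->depth + 1] = child_index;

    for (; dir != NULL; dir = dir->host_parent)
        index_per_depth[dir->depth] = dir->index_in_parent;
}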
static NV_STATUS alloc_4k_maxwell(uvm_gpu_t *gpu)
@@ -2347,7 +2466,13 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
// calls.
TEST_NV_CHECK_GOTO(fake_tlb_invals_alloc(), done);
TEST_NV_CHECK_GOTO(maxwell_test_page_tree(gpu), done);
// We prevent the maxwell_test_page_tree test from running on ATS-enabled
// systems. On "fake" Maxwell-based ATS systems, pde_fill() may push more
// methods than UVM supports, specifically during uvm_page_tree_init(),
// which eventually calls phys_mem_init(). On Maxwell, upper PDE levels
// have more than 512 entries.
if (!g_uvm_global.ats.enabled)
TEST_NV_CHECK_GOTO(maxwell_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(pascal_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(volta_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(ampere_test_page_tree(gpu), done);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2020 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -100,4 +100,6 @@ void uvm_hal_pascal_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->smc.supported = false;
parent_gpu->plc_supported = false;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -142,12 +142,16 @@ static NvU64 small_half_pde_pascal(uvm_mmu_page_table_alloc_t *phys_alloc)
static void make_pde_pascal(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
uvm_page_directory_t *dir,
NvU32 child_index)
{
NvU32 entry_count = entries_per_index_pascal(depth);
NvU32 entry_count;
NvU64 *entry_bits = (NvU64 *)entry;
UVM_ASSERT(dir);
entry_count = entries_per_index_pascal(dir->depth);
if (entry_count == 1) {
*entry_bits = single_pde_pascal(*phys_allocs);
}
@@ -155,7 +159,8 @@ static void make_pde_pascal(void *entry,
entry_bits[MMU_BIG] = big_half_pde_pascal(phys_allocs[MMU_BIG]);
entry_bits[MMU_SMALL] = small_half_pde_pascal(phys_allocs[MMU_SMALL]);
// This entry applies to the whole dual PDE but is stored in the lower bits
// This entry applies to the whole dual PDE but is stored in the lower
// bits.
entry_bits[MMU_BIG] |= HWCONST64(_MMU_VER2, DUAL_PDE, IS_PDE, TRUE);
}
else {

View File

@@ -36,6 +36,7 @@
#include "uvm_mmu.h"
#include "uvm_gpu_access_counters.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_migrate_pageable.h"
static NV_STATUS uvm_test_get_gpu_ref_count(UVM_TEST_GET_GPU_REF_COUNT_PARAMS *params, struct file *filp)
{
@@ -331,6 +332,7 @@ long uvm_test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
UVM_ROUTE_CMD_STACK_NO_INIT_CHECK(UVM_TEST_CGROUP_ACCOUNTING_SUPPORTED, uvm_test_cgroup_accounting_supported);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SPLIT_INVALIDATE_DELAY, uvm_test_split_invalidate_delay);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_CPU_CHUNK_API, uvm_test_cpu_chunk_api);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SKIP_MIGRATE_VMA, uvm_test_skip_migrate_vma);
}
return -EINVAL;

View File

@@ -28,6 +28,13 @@
#include "uvm_ioctl.h"
#include "nv_uvm_types.h"
#define UVM_TEST_SKIP_MIGRATE_VMA UVM_TEST_IOCTL_BASE(103)
typedef struct
{
NvBool skip; // In
NV_STATUS rmStatus; // Out
} UVM_TEST_SKIP_MIGRATE_VMA_PARAMS;
#ifdef __cplusplus
extern "C" {
#endif
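The diff does not show the handler this ioctl routes to. A plausible shape is sketched below, assuming the flag simply lands in the per-VA-space test state added later in this change; uvm_va_space_get(), the lock helpers and the test.skip_migrate_vma field location are assumptions here, not confirmed by this excerpt:

// Hypothetical sketch of the handler wired up in uvm_test_ioctl(); the real
// implementation is not part of this excerpt.
static NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    uvm_va_space_down_write(va_space);
    va_space->test.skip_migrate_vma = params->skip;   // assumed field location
    uvm_va_space_up_write(va_space);

    return NV_OK;
}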

View File

@@ -1082,25 +1082,19 @@ void uvm_tools_broadcast_replay(uvm_gpu_t *gpu,
}
void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu,
NvU32 batch_id,
uvm_fault_client_type_t client_type)
void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu, NvU32 batch_id, uvm_fault_client_type_t client_type)
{
UVM_ASSERT(!gpu->parent->has_clear_faulted_channel_method);
if (!tools_is_event_enabled_in_any_va_space(UvmEventTypeGpuFaultReplay))
return;
record_replay_event_helper(gpu->id,
batch_id,
client_type,
NV_GETTIME(),
gpu->parent->host_hal->get_time(gpu));
record_replay_event_helper(gpu->id, batch_id, client_type, NV_GETTIME(), gpu->parent->host_hal->get_time(gpu));
}
void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *buffer_entry,
bool on_managed)
bool on_managed_phys)
{
UvmEventEntry entry;
UvmEventTestAccessCounterInfo *info = &entry.testEventData.accessCounter;
@@ -1119,6 +1113,7 @@ void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
info->srcIndex = uvm_id_value(gpu->id);
info->address = buffer_entry->address.address;
info->isVirtual = buffer_entry->address.is_virtual? 1: 0;
if (buffer_entry->address.is_virtual) {
info->instancePtr = buffer_entry->virtual_info.instance_ptr.address;
info->instancePtrAperture = g_hal_to_tools_aperture_table[buffer_entry->virtual_info.instance_ptr.aperture];
@@ -1126,9 +1121,10 @@ void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
}
else {
info->aperture = g_hal_to_tools_aperture_table[buffer_entry->address.aperture];
info->physOnManaged = on_managed_phys? 1 : 0;
}
info->isFromCpu = buffer_entry->counter_type == UVM_ACCESS_COUNTER_TYPE_MOMC? 1: 0;
info->onManaged = on_managed? 1 : 0;
info->value = buffer_entry->counter_value;
info->subGranularity = buffer_entry->sub_granularity;
info->bank = buffer_entry->bank;

View File

@@ -102,18 +102,13 @@ void uvm_tools_record_read_duplicate_invalidate(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
const uvm_page_mask_t *page_mask);
void uvm_tools_broadcast_replay(uvm_gpu_t *gpu,
uvm_push_t *push,
NvU32 batch_id,
uvm_fault_client_type_t client_type);
void uvm_tools_broadcast_replay(uvm_gpu_t *gpu, uvm_push_t *push, NvU32 batch_id, uvm_fault_client_type_t client_type);
void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu,
NvU32 batch_id,
uvm_fault_client_type_t client_type);
void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu, NvU32 batch_id, uvm_fault_client_type_t client_type);
void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *buffer_entry,
bool on_managed);
bool on_managed_phys);
void uvm_tools_test_hmm_split_invalidate(uvm_va_space_t *va_space);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2021 NVIDIA Corporation
Copyright (c) 2017-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -93,4 +93,6 @@ void uvm_hal_turing_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->smc.supported = false;
parent_gpu->plc_supported = true;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -967,8 +967,10 @@ typedef struct
NvU8 isFromCpu;
NvU8 veId;
NvU8 onManaged; // The access counter notification was triggered on
// a managed memory region
// The physical access counter notification was triggered on a managed
// memory region. This is not set for virtual access counter notifications.
NvU8 physOnManaged;
NvU32 value;
NvU32 subGranularity;

View File

@@ -1760,6 +1760,21 @@ static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
return (NvU32)chunk_size;
}
NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
uvm_processor_id_t processor,
uvm_page_index_t page_index)
{
block_phys_page_t page;
UVM_ASSERT(block);
uvm_assert_mutex_locked(&block->lock);
page = block_phys_page(processor, page_index);
return block_phys_page_size(block, page);
}
static uvm_pte_bits_cpu_t get_cpu_pte_bit_index(uvm_prot_t prot)
{
uvm_pte_bits_cpu_t pte_bit_index = UVM_PTE_BITS_CPU_MAX;
@@ -8248,14 +8263,6 @@ void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
event_data.block_munmap.region = region;
uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_MUNMAP, &event_data);
// Set a flag so that GPU fault events are flushed since they might refer
// to the region being unmapped.
// Note that holding the va_block lock prevents GPU VA spaces from
// being removed so the registered_gpu_va_spaces mask is stable.
for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
}
// Release any remaining vidmem chunks in the given region.
for_each_gpu_id(gpu_id) {
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
@@ -10172,7 +10179,11 @@ static uvm_processor_id_t block_select_residency(uvm_va_block_t *va_block,
// The logic that's used to detect remote faulting also keeps memory in place for
// ptrace accesses. We would prefer to control those policies separately, but the
// NIC case takes priority.
// If the accessing processor is the CPU, we're either handling a fault
// from a process other than the owning process, or we're handling an
// MOMC notification. Only prevent migration for the former.
if (UVM_ID_IS_CPU(processor_id) &&
operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS &&
uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
va_block_context->mm != current->mm) {
UVM_ASSERT(va_block_context->mm != NULL);

View File

@@ -2072,6 +2072,14 @@ void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
// Locking: The va_block lock must be held.
void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region);
// Get the size of the physical allocation backing the page at page_index on the
// specified processor in the block. Returns 0 if the address is not resident on
// the specified processor.
// Locking: The va_block lock must be held.
NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
uvm_processor_id_t processor,
uvm_page_index_t page_index);
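A hedged caller sketch for the new query; the locking requirement comes from the comment above, while the lock helper names are assumptions for illustration:

// Sketch only: query the backing allocation size with the va_block lock held.
static NvU32 example_backing_size(uvm_va_block_t *block,
                                  uvm_processor_id_t proc,
                                  uvm_page_index_t page_index)
{
    NvU32 size;

    uvm_mutex_lock(&block->lock);
    size = uvm_va_block_get_physical_size(block, proc, page_index); // 0 if not resident
    uvm_mutex_unlock(&block->lock);

    return size;
}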
// Get CPU page size or 0 if it is not mapped
NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
uvm_page_index_t page_index);

View File

@@ -1540,7 +1540,6 @@ static void remove_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space,
atomic_inc(&va_space->gpu_va_space_deferred_free.num_pending);
uvm_processor_mask_clear(&va_space->registered_gpu_va_spaces, gpu_va_space->gpu->id);
uvm_processor_mask_clear_atomic(&va_space->needs_fault_buffer_flush, gpu_va_space->gpu->id);
va_space->gpu_va_spaces[uvm_id_gpu_index(gpu_va_space->gpu->id)] = NULL;
gpu_va_space->state = UVM_GPU_VA_SPACE_STATE_DEAD;
}

View File

@@ -253,17 +253,6 @@ struct uvm_va_space_struct
// corrupting state.
uvm_processor_mask_t gpu_unregister_in_progress;
// On VMA destruction, the fault buffer needs to be flushed for all the GPUs
// registered in the VA space to avoid leaving stale entries of the VA range
// that is going to be destroyed. Otherwise, these fault entries can be
// attributed to new VA ranges reallocated at the same addresses. However,
// uvm_vm_close is called with mm->mmap_lock taken and we cannot take the
// ISR lock. Therefore, we use a flag to notify the GPU fault handler that
// the fault buffer needs to be flushed, before servicing the faults that
// belong to the va_space. The bits are set and cleared atomically so no
// va_space lock is required.
uvm_processor_mask_t needs_fault_buffer_flush;
// Mask of processors that are participating in system-wide atomics
uvm_processor_mask_t system_wide_atomics_enabled_processors;
@@ -353,6 +342,7 @@ struct uvm_va_space_struct
struct
{
bool page_prefetch_enabled;
bool skip_migrate_vma;
atomic_t migrate_vma_allocation_fail_nth;

View File

@@ -215,7 +215,13 @@ bool uvm_va_space_mm_enabled(uvm_va_space_t *va_space)
static struct mmu_notifier_ops uvm_mmu_notifier_ops_ats =
{
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE)
.invalidate_range = uvm_mmu_notifier_invalidate_range_ats,
#elif defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
.arch_invalidate_secondary_tlbs = uvm_mmu_notifier_invalidate_range_ats,
#else
#error One of invalidate_range/arch_invalidate_secondary_tlbs must be present
#endif
};
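Either way, the same handler backs the notifier. A minimal sketch of attaching this ops table to an mm via the standard kernel API; the wrapper struct and registration helper are hypothetical, not the driver's code:

// Hypothetical illustration only: embed a subscription, point it at the ops
// table above, and register it against the target mm.
#include <linux/mmu_notifier.h>

struct example_ats_notifier
{
    struct mmu_notifier mn;
};

static int example_register_ats_notifier(struct example_ats_notifier *n, struct mm_struct *mm)
{
    n->mn.ops = &uvm_mmu_notifier_ops_ats;
    return mmu_notifier_register(&n->mn, mm);
}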
static int uvm_mmu_notifier_register(uvm_va_space_mm_t *va_space_mm)

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2021 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -98,4 +98,6 @@ void uvm_hal_volta_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->smc.supported = false;
parent_gpu->plc_supported = false;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -147,14 +147,18 @@ static NvU64 small_half_pde_volta(uvm_mmu_page_table_alloc_t *phys_alloc)
static void make_pde_volta(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
uvm_page_directory_t *dir,
NvU32 child_index)
{
NvU32 entry_count = entries_per_index_volta(depth);
NvU32 entry_count;
NvU64 *entry_bits = (NvU64 *)entry;
UVM_ASSERT(dir);
entry_count = entries_per_index_volta(dir->depth);
if (entry_count == 1) {
*entry_bits = single_pde_volta(*phys_allocs, depth);
*entry_bits = single_pde_volta(*phys_allocs, dir->depth);
}
else if (entry_count == 2) {
entry_bits[MMU_BIG] = big_half_pde_volta(phys_allocs[MMU_BIG]);

View File

@@ -156,7 +156,7 @@ NvS32 NV_API_CALL nv_request_msix_irq(nv_linux_state_t *nvl)
{
for( j = 0; j < i; j++)
{
free_irq(nvl->msix_entries[i].vector, (void *)nvl);
free_irq(nvl->msix_entries[j].vector, (void *)nvl);
}
break;
}
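The one-character fix above matters: on a request failure at index i, the cleanup loop must release vectors 0..i-1 that were actually requested, rather than freeing entry i repeatedly. A generic, self-contained version of the corrected pattern (illustrative, not the driver's exact code):

#include <linux/interrupt.h>
#include <linux/pci.h>

// Illustrative only: request count MSI-X vectors, unwinding the ones already
// granted if any request fails.
static int example_request_msix(struct msix_entry *entries, int count,
                                irq_handler_t handler, void *ctx)
{
    int i, j, rc = 0;

    for (i = 0; i < count; i++)
    {
        rc = request_irq(entries[i].vector, handler, 0, "example-msix", ctx);
        if (rc != 0)
        {
            for (j = 0; j < i; j++)
                free_irq(entries[j].vector, ctx);   // j, not i: free what we got
            break;
        }
    }

    return rc;
}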

View File

@@ -506,8 +506,13 @@ static int nv_p2p_get_pages(
(*page_table)->page_size = page_size_index;
os_free_mem(physical_addresses);
physical_addresses = NULL;
os_free_mem(wreqmb_h);
wreqmb_h = NULL;
os_free_mem(rreqmb_h);
rreqmb_h = NULL;
if (free_callback != NULL)
{