545.23.06

Andy Ritger
2023-10-17 09:25:29 -07:00
parent f59818b751
commit b5bf85a8e3
917 changed files with 132480 additions and 110015 deletions

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2022 NVIDIA Corporation
Copyright (c) 2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2022 NVIDIA Corporation
Copyright (c) 2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

View File

@@ -247,6 +247,11 @@ int nv_kthread_q_init_on_node(nv_kthread_q_t *q, const char *q_name, int preferr
return 0;
}
int nv_kthread_q_init(nv_kthread_q_t *q, const char *qname)
{
return nv_kthread_q_init_on_node(q, qname, NV_KTHREAD_NO_NODE);
}
// Returns true (non-zero) if the item was actually scheduled, and false if the
// item was already pending in a queue.
static int _raw_q_schedule(nv_kthread_q_t *q, nv_kthread_q_item_t *q_item)
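A minimal caller-side sketch of the nv_kthread_q_init() wrapper above, for readers unfamiliar with the queue API. It assumes the usual nv-kthread-q item interface (nv_kthread_q_item_init, nv_kthread_q_schedule_q_item, nv_kthread_q_stop) and the "nv-kthread-q.h" header name; treat those names as assumptions, not part of this hunk.

#include "nv-kthread-q.h"   // assumed header providing nv_kthread_q_t and friends

// Hypothetical example, not from this commit.
static void example_work_cb(void *args)
{
    (void)args;             // real work would happen here
}

static int example_use_queue(void)
{
    nv_kthread_q_t q;
    nv_kthread_q_item_t item;
    int ret;

    // No NUMA preference: per the wrapper above, this forwards NV_KTHREAD_NO_NODE.
    ret = nv_kthread_q_init(&q, "example_q");
    if (ret != 0)
        return ret;

    nv_kthread_q_item_init(&item, example_work_cb, NULL);

    // Non-zero if the item was scheduled, zero if it was already pending,
    // mirroring the _raw_q_schedule() contract documented above.
    nv_kthread_q_schedule_q_item(&q, &item);

    nv_kthread_q_stop(&q);  // flush pending items and stop the worker thread
    return 0;
}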

View File

@@ -27,6 +27,7 @@ NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_rm_mem.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_channel.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_lock.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hal.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_processors.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_range_tree.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_rb_tree.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_range_allocator.c

View File

@@ -82,10 +82,12 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_pages_uc
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ioasid_get
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_drop
NV_CONFTEST_FUNCTION_COMPILE_TESTS += migrate_vma_setup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmget_not_zero
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmgrab
NV_CONFTEST_FUNCTION_COMPILE_TESTS += iommu_sva_bind_device_has_drvdata_arg
NV_CONFTEST_FUNCTION_COMPILE_TESTS += vm_fault_to_errno
NV_CONFTEST_FUNCTION_COMPILE_TESTS += find_next_bit_wrap
NV_CONFTEST_TYPE_COMPILE_TESTS += backing_dev_info
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_context_t
@@ -99,6 +101,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += kmem_cache_has_kobj_remove_work
NV_CONFTEST_TYPE_COMPILE_TESTS += sysfs_slab_unlink
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_fault_t
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_invalidate_range
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_arch_invalidate_secondary_tlbs
NV_CONFTEST_TYPE_COMPILE_TESTS += proc_ops
NV_CONFTEST_TYPE_COMPILE_TESTS += timespec64
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock
@@ -113,4 +116,3 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += mpol_preferred_many_present
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_interval_notifier
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_int_active_memcg
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_migrate_vma_setup

View File

@@ -24,11 +24,11 @@
#include "nvstatus.h"
#if !defined(NV_PRINTF_STRING_SECTION)
#if defined(NVRM) && NVCPU_IS_RISCV64
#if defined(NVRM) && NVOS_IS_LIBOS
#define NV_PRINTF_STRING_SECTION __attribute__ ((section (".logging")))
#else // defined(NVRM) && NVCPU_IS_RISCV64
#else // defined(NVRM) && NVOS_IS_LIBOS
#define NV_PRINTF_STRING_SECTION
#endif // defined(NVRM) && NVCPU_IS_RISCV64
#endif // defined(NVRM) && NVOS_IS_LIBOS
#endif // !defined(NV_PRINTF_STRING_SECTION)
/*

View File

@@ -571,7 +571,6 @@ static void uvm_vm_open_managed_entry(struct vm_area_struct *vma)
static void uvm_vm_close_managed(struct vm_area_struct *vma)
{
uvm_va_space_t *va_space = uvm_va_space_get(vma->vm_file);
uvm_processor_id_t gpu_id;
bool make_zombie = false;
if (current->mm != NULL)
@@ -606,12 +605,6 @@ static void uvm_vm_close_managed(struct vm_area_struct *vma)
uvm_destroy_vma_managed(vma, make_zombie);
// Notify GPU address spaces that the fault buffer needs to be flushed to
// avoid finding stale entries that can be attributed to new VA ranges
// reallocated at the same address.
for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
}
uvm_va_space_up_write(va_space);
if (current->mm != NULL)

View File

@@ -216,6 +216,10 @@ NV_STATUS UvmDeinitialize(void);
// Note that it is not required to release VA ranges that were reserved with
// UvmReserveVa().
//
// This is useful for per-process checkpoint and restore, where kernel-mode
// state needs to be reconfigured to match the expectations of a pre-existing
// user-mode process.
//
// UvmReopen() closes the open file returned by UvmGetFileDescriptor() and
// replaces it with a new open file with the same name.
//
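A minimal restore-time sketch of the flow described above. The exact UvmReopen() prototype used here (a single flags argument) and the zero flags value are assumptions for illustration, not quoted from this header.

// Hypothetical checkpoint/restore helper, assuming NV_STATUS UvmReopen(NvU64 flags).
static NV_STATUS example_restore_uvm(void)
{
    // Swap the UVM file returned by UvmGetFileDescriptor() for a fresh open
    // file of the same name, as described above, then rebuild the remaining
    // user-mode UVM state (GPU registrations, mappings, ...) for the restored
    // process. Reserved VA ranges from UvmReserveVa() do not need to be
    // re-released first.
    NV_STATUS status = UvmReopen(0);

    if (status != NV_OK)
        return status;

    // ... re-create the remaining UVM resources here ...
    return NV_OK;
}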

View File

@@ -114,6 +114,8 @@ static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
{
uvm_ats_fault_invalidate_t *ats_invalidate;
uvm_ats_smmu_invalidate_tlbs(gpu_va_space, addr, size);
if (client_type == UVM_FAULT_CLIENT_TYPE_GPC)
ats_invalidate = &gpu_va_space->gpu->parent->fault_buffer_info.replayable.ats_invalidate;
else
@@ -588,4 +590,3 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
return status;
}

View File

@@ -29,8 +29,12 @@
#include "uvm_va_space.h"
#include "uvm_va_space_mm.h"
#include <asm/io.h>
#include <linux/iommu.h>
#include <linux/mm_types.h>
#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/mmu_context.h>
// linux/sched/mm.h is needed for mmget_not_zero and mmput to get the mm
// reference required for the iommu_sva_bind_device() call. This header is not
@@ -46,17 +50,271 @@
#define UVM_IOMMU_SVA_BIND_DEVICE(dev, mm) iommu_sva_bind_device(dev, mm)
#endif
// Base address of SMMU CMDQ-V for GSMMU0.
#define SMMU_CMDQV_BASE_ADDR(smmu_base) (smmu_base + 0x200000)
#define SMMU_CMDQV_BASE_LEN 0x00830000
// CMDQV configuration is done by firmware but we check status here.
#define SMMU_CMDQV_CONFIG 0x0
#define SMMU_CMDQV_CONFIG_CMDQV_EN BIT(0)
// Used to map a particular VCMDQ to a VINTF.
#define SMMU_CMDQV_CMDQ_ALLOC_MAP(vcmdq_id) (0x200 + 0x4 * (vcmdq_id))
#define SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC BIT(0)
// Shift for the field containing the index of the virtual interface
// owning the VCMDQ.
#define SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT 15
// Base address for the VINTF registers.
#define SMMU_VINTF_BASE_ADDR(cmdqv_base_addr, vintf_id) (cmdqv_base_addr + 0x1000 + 0x100 * (vintf_id))
// Virtual interface (VINTF) configuration registers. The WAR only
// works on baremetal so we need to configure ourselves as the
// hypervisor owner.
#define SMMU_VINTF_CONFIG 0x0
#define SMMU_VINTF_CONFIG_ENABLE BIT(0)
#define SMMU_VINTF_CONFIG_HYP_OWN BIT(17)
#define SMMU_VINTF_STATUS 0x0
#define SMMU_VINTF_STATUS_ENABLED BIT(0)
// Calculates the base address for a particular VCMDQ instance.
#define SMMU_VCMDQ_BASE_ADDR(cmdqv_base_addr, vcmdq_id) (cmdqv_base_addr + 0x10000 + 0x80 * (vcmdq_id))
// SMMU command queue consumer index register. Updated by SMMU
// when commands are consumed.
#define SMMU_VCMDQ_CONS 0x0
// SMMU command queue producer index register. Updated by UVM when
// commands are added to the queue.
#define SMMU_VCMDQ_PROD 0x4
// Configuration register used to enable a VCMDQ.
#define SMMU_VCMDQ_CONFIG 0x8
#define SMMU_VCMDQ_CONFIG_ENABLE BIT(0)
// Status register used to check the VCMDQ is enabled.
#define SMMU_VCMDQ_STATUS 0xc
#define SMMU_VCMDQ_STATUS_ENABLED BIT(0)
// Base address offset for the VCMDQ registers.
#define SMMU_VCMDQ_CMDQ_BASE 0x10000
// Size of the command queue. Each command is 8 bytes and we can't
// have a command queue greater than one page.
#define SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE 9
#define SMMU_VCMDQ_CMDQ_ENTRIES (1UL << SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE)
// We always use VINTF63 for the WAR
#define VINTF 63
static void smmu_vintf_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
{
iowrite32(val, SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg);
}
static NvU32 smmu_vintf_read32(void __iomem *smmu_cmdqv_base, int reg)
{
return ioread32(SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg);
}
// We always use VCMDQ127 for the WAR
#define VCMDQ 127
void smmu_vcmdq_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
{
iowrite32(val, SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
NvU32 smmu_vcmdq_read32(void __iomem *smmu_cmdqv_base, int reg)
{
return ioread32(SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
static void smmu_vcmdq_write64(void __iomem *smmu_cmdqv_base, int reg, NvU64 val)
{
iowrite64(val, SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
// Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
// TLB invalidates on read-only to read-write upgrades
static NV_STATUS uvm_ats_smmu_war_init(uvm_parent_gpu_t *parent_gpu)
{
uvm_spin_loop_t spin;
NV_STATUS status;
unsigned long cmdqv_config;
void __iomem *smmu_cmdqv_base;
struct acpi_iort_node *node;
struct acpi_iort_smmu_v3 *iort_smmu;
node = *(struct acpi_iort_node **) dev_get_platdata(parent_gpu->pci_dev->dev.iommu->iommu_dev->dev->parent);
iort_smmu = (struct acpi_iort_smmu_v3 *) node->node_data;
smmu_cmdqv_base = ioremap(SMMU_CMDQV_BASE_ADDR(iort_smmu->base_address), SMMU_CMDQV_BASE_LEN);
if (!smmu_cmdqv_base)
return NV_ERR_NO_MEMORY;
parent_gpu->smmu_war.smmu_cmdqv_base = smmu_cmdqv_base;
cmdqv_config = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CONFIG);
if (!(cmdqv_config & SMMU_CMDQV_CONFIG_CMDQV_EN)) {
status = NV_ERR_OBJECT_NOT_FOUND;
goto out;
}
// Allocate SMMU CMDQ pages for WAR
parent_gpu->smmu_war.smmu_cmdq = alloc_page(NV_UVM_GFP_FLAGS | __GFP_ZERO);
if (!parent_gpu->smmu_war.smmu_cmdq) {
status = NV_ERR_NO_MEMORY;
goto out;
}
// Initialise VINTF for the WAR
smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, SMMU_VINTF_CONFIG_ENABLE | SMMU_VINTF_CONFIG_HYP_OWN);
UVM_SPIN_WHILE(!(smmu_vintf_read32(smmu_cmdqv_base, SMMU_VINTF_STATUS) & SMMU_VINTF_STATUS_ENABLED), &spin);
// Allocate VCMDQ to VINTF
iowrite32((VINTF << SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT) | SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC,
smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
BUILD_BUG_ON((SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE + 3) > PAGE_SHIFT);
smmu_vcmdq_write64(smmu_cmdqv_base, SMMU_VCMDQ_CMDQ_BASE,
page_to_phys(parent_gpu->smmu_war.smmu_cmdq) | SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONS, 0);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_PROD, 0);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, SMMU_VCMDQ_CONFIG_ENABLE);
UVM_SPIN_WHILE(!(smmu_vcmdq_read32(smmu_cmdqv_base, SMMU_VCMDQ_STATUS) & SMMU_VCMDQ_STATUS_ENABLED), &spin);
uvm_mutex_init(&parent_gpu->smmu_war.smmu_lock, UVM_LOCK_ORDER_LEAF);
parent_gpu->smmu_war.smmu_prod = 0;
parent_gpu->smmu_war.smmu_cons = 0;
return NV_OK;
out:
iounmap(parent_gpu->smmu_war.smmu_cmdqv_base);
parent_gpu->smmu_war.smmu_cmdqv_base = NULL;
return status;
}
static void uvm_ats_smmu_war_deinit(uvm_parent_gpu_t *parent_gpu)
{
void __iomem *smmu_cmdqv_base = parent_gpu->smmu_war.smmu_cmdqv_base;
NvU32 cmdq_alloc_map;
if (parent_gpu->smmu_war.smmu_cmdqv_base) {
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, 0);
cmdq_alloc_map = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
iowrite32(cmdq_alloc_map & SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC, smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, 0);
}
if (parent_gpu->smmu_war.smmu_cmdq)
__free_page(parent_gpu->smmu_war.smmu_cmdq);
if (parent_gpu->smmu_war.smmu_cmdqv_base)
iounmap(parent_gpu->smmu_war.smmu_cmdqv_base);
}
// The SMMU on ARM64 can run under different translation regimes depending on
// what features the OS and CPU variant support. The CPU for GH180 supports
// virtualisation extensions and starts the kernel at EL2, meaning the SMMU operates
// under the NS-EL2-E2H translation regime. Therefore we need to use the
// TLBI_EL2_* commands which invalidate TLB entries created under this
// translation regime.
#define CMDQ_OP_TLBI_EL2_ASID 0x21;
#define CMDQ_OP_TLBI_EL2_VA 0x22;
#define CMDQ_OP_CMD_SYNC 0x46
// Use the same maximum as used for MAX_TLBI_OPS in the upstream
// kernel.
#define UVM_MAX_TLBI_OPS (1UL << (PAGE_SHIFT - 3))
#if UVM_ATS_SMMU_WAR_REQUIRED()
void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
struct mm_struct *mm = gpu_va_space->va_space->va_space_mm.mm;
uvm_parent_gpu_t *parent_gpu = gpu_va_space->gpu->parent;
struct {
NvU64 low;
NvU64 high;
} *vcmdq;
unsigned long vcmdq_prod;
NvU64 end;
uvm_spin_loop_t spin;
NvU16 asid;
if (!parent_gpu->smmu_war.smmu_cmdqv_base)
return;
asid = arm64_mm_context_get(mm);
vcmdq = kmap(parent_gpu->smmu_war.smmu_cmdq);
uvm_mutex_lock(&parent_gpu->smmu_war.smmu_lock);
vcmdq_prod = parent_gpu->smmu_war.smmu_prod;
// Our queue management is very simple. The mutex prevents multiple
// producers from writing to the queue, and all our commands require waiting
// for the queue to drain so we know it's empty. If we can't fit enough
// commands in the queue we just invalidate the whole ASID.
//
// The command queue is a circular buffer with the MSB representing a wrap
// bit that must toggle on each wrap. See the SMMU architecture
// specification for more details.
//
// SMMU_VCMDQ_CMDQ_ENTRIES - 1 because we need to leave space for the
// CMD_SYNC.
if ((size >> PAGE_SHIFT) > min(UVM_MAX_TLBI_OPS, SMMU_VCMDQ_CMDQ_ENTRIES - 1)) {
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_TLBI_EL2_ASID;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low |= (NvU64) asid << 48;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = 0;
vcmdq_prod++;
}
else {
for (end = addr + size; addr < end; addr += PAGE_SIZE) {
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_TLBI_EL2_VA;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low |= (NvU64) asid << 48;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = addr & ~((1UL << 12) - 1);
vcmdq_prod++;
}
}
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_CMD_SYNC;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = 0x0;
vcmdq_prod++;
// MSB is the wrap bit
vcmdq_prod &= (1UL << (SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE + 1)) - 1;
parent_gpu->smmu_war.smmu_prod = vcmdq_prod;
smmu_vcmdq_write32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_PROD, parent_gpu->smmu_war.smmu_prod);
UVM_SPIN_WHILE(
(smmu_vcmdq_read32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_CONS) & GENMASK(19, 0)) != vcmdq_prod,
&spin);
uvm_mutex_unlock(&parent_gpu->smmu_war.smmu_lock);
kunmap(parent_gpu->smmu_war.smmu_cmdq);
arm64_mm_context_put(mm);
}
#endif
NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
{
int ret;
ret = iommu_dev_enable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
if (ret)
return errno_to_nv_status(ret);
return errno_to_nv_status(ret);
if (UVM_ATS_SMMU_WAR_REQUIRED())
return uvm_ats_smmu_war_init(parent_gpu);
else
return NV_OK;
}
void uvm_ats_sva_remove_gpu(uvm_parent_gpu_t *parent_gpu)
{
if (UVM_ATS_SMMU_WAR_REQUIRED())
uvm_ats_smmu_war_deinit(parent_gpu);
iommu_dev_disable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
}
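A standalone user-space sketch (not driver code) of the producer-index arithmetic used by uvm_ats_smmu_invalidate_tlbs() above: the index carries SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE slot bits plus one extra wrap bit in the MSB, and the driver masks the index the same way after appending a batch of commands.

#include <stdio.h>

#define LOG2SIZE 9ul                  // mirrors SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE
#define ENTRIES  (1ul << LOG2SIZE)    // 512 queue slots

// Advance the producer index by one command, keeping LOG2SIZE index bits plus
// the wrap bit. The wrap bit toggles every time the index rolls over, which is
// how producer and consumer distinguish a full queue from an empty one.
static unsigned long push_one(unsigned long prod)
{
    unsigned long slot = prod % ENTRIES;    // slot the command would be written to
    (void)slot;
    prod++;
    return prod & ((1ul << (LOG2SIZE + 1)) - 1);
}

int main(void)
{
    unsigned long prod;

    prod = push_one(ENTRIES - 1);           // write slot 511
    printf("prod=%lu slot=%lu wrap=%lu\n", prod, prod % ENTRIES, prod >> LOG2SIZE);
    // prints prod=512 slot=0 wrap=1: back to slot 0 with the wrap bit set

    prod = push_one(2 * ENTRIES - 1);       // write slot 511 with the wrap bit set
    printf("prod=%lu slot=%lu wrap=%lu\n", prod, prod % ENTRIES, prod >> LOG2SIZE);
    // prints prod=0 slot=0 wrap=0: 1024 masks to 0, toggling the wrap bit again
    return 0;
}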

View File

@@ -53,6 +53,17 @@
#define UVM_ATS_SVA_SUPPORTED() 0
#endif
// If NV_ARCH_INVALIDATE_SECONDARY_TLBS is defined it means the upstream fix is
// in place so no need for the WAR from Bug 4130089: [GH180][r535] WAR for
// kernel not issuing SMMU TLB invalidates on read-only to read-write upgrades
#if defined(NV_ARCH_INVALIDATE_SECONDARY_TLBS)
#define UVM_ATS_SMMU_WAR_REQUIRED() 0
#elif NVCPU_IS_AARCH64
#define UVM_ATS_SMMU_WAR_REQUIRED() 1
#else
#define UVM_ATS_SMMU_WAR_REQUIRED() 0
#endif
typedef struct
{
int placeholder;
@@ -81,6 +92,17 @@ typedef struct
// LOCKING: None
void uvm_ats_sva_unregister_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space);
// Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
// TLB invalidates on read-only to read-write upgrades
#if UVM_ATS_SMMU_WAR_REQUIRED()
void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size);
#else
static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
}
#endif
#else
static NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
{
@@ -111,6 +133,11 @@ typedef struct
{
}
static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
}
#endif // UVM_ATS_SVA_SUPPORTED
#endif // __UVM_ATS_SVA_H__

View File

@@ -2683,7 +2683,7 @@ static void init_channel_manager_conf(uvm_channel_manager_t *manager)
// caches vidmem (and sysmem), we place GPFIFO and GPPUT on sysmem to avoid
// cache thrash. The memory access latency is reduced, despite the required
// access through the bus, because no cache coherence message is exchanged.
if (uvm_gpu_is_coherent(gpu->parent)) {
if (uvm_parent_gpu_is_coherent(gpu->parent)) {
manager->conf.gpfifo_loc = UVM_BUFFER_LOCATION_SYS;
// On GPUs with limited ESCHED addressing range, e.g., Volta on P9, RM

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2021 NVIDIA Corporation
Copyright (c) 2013-2023 NVIDIA Corporation
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@@ -233,18 +233,6 @@ unsigned uvm_get_stale_thread_id(void)
return (unsigned)task_pid_vnr(current);
}
//
// A simple security rule for allowing access to UVM user space memory: if you
// are the same user as the owner of the memory, or if you are root, then you
// are granted access. The idea is to allow debuggers and profilers to work, but
// without opening up any security holes.
//
NvBool uvm_user_id_security_check(uid_t euidTarget)
{
return (NV_CURRENT_EUID() == euidTarget) ||
(UVM_ROOT_UID == euidTarget);
}
void on_uvm_test_fail(void)
{
(void)NULL;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2021 NVIDIA Corporation
Copyright (c) 2013-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -282,9 +282,6 @@ static inline void kmem_cache_destroy_safe(struct kmem_cache **ppCache)
}
}
static const uid_t UVM_ROOT_UID = 0;
typedef struct
{
NvU64 start_time_ns;
@@ -335,7 +332,6 @@ NV_STATUS errno_to_nv_status(int errnoCode);
int nv_status_to_errno(NV_STATUS status);
unsigned uvm_get_stale_process_id(void);
unsigned uvm_get_stale_thread_id(void);
NvBool uvm_user_id_security_check(uid_t euidTarget);
extern int uvm_enable_builtin_tests;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2021-2023 NVIDIA Corporation
Copyright (c) 2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -54,26 +54,23 @@ bool uvm_conf_computing_mode_is_hcc(const uvm_gpu_t *gpu)
return uvm_conf_computing_get_mode(gpu->parent) == UVM_GPU_CONF_COMPUTE_MODE_HCC;
}
void uvm_conf_computing_check_parent_gpu(const uvm_parent_gpu_t *parent)
NV_STATUS uvm_conf_computing_init_parent_gpu(const uvm_parent_gpu_t *parent)
{
uvm_gpu_t *first_gpu;
UvmGpuConfComputeMode cc, sys_cc;
uvm_gpu_t *first;
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
// The Confidential Computing state of the GPU should match that of the
// system.
UVM_ASSERT(uvm_conf_computing_mode_enabled_parent(parent) == g_uvm_global.conf_computing_enabled);
// TODO: Bug 2844714: since we have no routine to traverse parent GPUs,
// find first child GPU and get its parent.
first_gpu = uvm_global_processor_mask_find_first_gpu(&g_uvm_global.retained_gpus);
if (first_gpu == NULL)
return;
first = uvm_global_processor_mask_find_first_gpu(&g_uvm_global.retained_gpus);
if (!first)
return NV_OK;
// All GPUs derive Confidential Computing status from their parent. By
// current policy all parent GPUs have identical Confidential Computing
// status.
UVM_ASSERT(uvm_conf_computing_get_mode(parent) == uvm_conf_computing_get_mode(first_gpu->parent));
sys_cc = uvm_conf_computing_get_mode(first->parent);
cc = uvm_conf_computing_get_mode(parent);
return cc == sys_cc ? NV_OK : NV_ERR_NOT_SUPPORTED;
}
static void dma_buffer_destroy_locked(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool,

View File

@@ -60,8 +60,10 @@
// UVM_METHOD_SIZE * 2 * 10 = 80.
#define UVM_CONF_COMPUTING_SIGN_BUF_MAX_SIZE 80
void uvm_conf_computing_check_parent_gpu(const uvm_parent_gpu_t *parent);
// All GPUs derive confidential computing status from their parent.
// By current policy all parent GPUs have identical confidential
// computing status.
NV_STATUS uvm_conf_computing_init_parent_gpu(const uvm_parent_gpu_t *parent);
bool uvm_conf_computing_mode_enabled_parent(const uvm_parent_gpu_t *parent);
bool uvm_conf_computing_mode_enabled(const uvm_gpu_t *gpu);
bool uvm_conf_computing_mode_is_hcc(const uvm_gpu_t *gpu);

View File

@@ -71,6 +71,11 @@ static void uvm_unregister_callbacks(void)
}
}
static void sev_init(const UvmPlatformInfo *platform_info)
{
g_uvm_global.sev_enabled = platform_info->sevEnabled;
}
NV_STATUS uvm_global_init(void)
{
NV_STATUS status;
@@ -119,7 +124,8 @@ NV_STATUS uvm_global_init(void)
uvm_ats_init(&platform_info);
g_uvm_global.num_simulated_devices = 0;
g_uvm_global.conf_computing_enabled = platform_info.confComputingEnabled;
sev_init(&platform_info);
status = uvm_gpu_init();
if (status != NV_OK) {

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -143,16 +143,11 @@ struct uvm_global_struct
struct page *page;
} unload_state;
// True if the VM has AMD's SEV, or equivalent HW security extensions such
// as Intel's TDX, enabled. The flag is always false on the host.
//
// This value moves in tandem with that of Confidential Computing in the
// GPU(s) in all supported configurations, so it is used as a proxy for the
// Confidential Computing state.
//
// This field is set once during global initialization (uvm_global_init),
// and can be read afterwards without acquiring any locks.
bool conf_computing_enabled;
// AMD Secure Encrypted Virtualization (SEV) status. True if VM has SEV
// enabled. This field is set once during global initialization
// (uvm_global_init), and can be read afterwards without acquiring any
// locks.
bool sev_enabled;
};
// Initialize global uvm state
@@ -238,10 +233,8 @@ static uvm_gpu_t *uvm_gpu_get_by_processor_id(uvm_processor_id_t id)
return gpu;
}
static uvmGpuSessionHandle uvm_gpu_session_handle(uvm_gpu_t *gpu)
static uvmGpuSessionHandle uvm_global_session_handle(void)
{
if (gpu->parent->smc.enabled)
return gpu->smc.rm_session_handle;
return g_uvm_global.rm_session_handle;
}

View File

@@ -99,8 +99,8 @@ static void fill_gpu_info(uvm_parent_gpu_t *parent_gpu, const UvmGpuInfo *gpu_in
parent_gpu->system_bus.link_rate_mbyte_per_s = gpu_info->sysmemLinkRateMBps;
if (gpu_info->systemMemoryWindowSize > 0) {
// memory_window_end is inclusive but uvm_gpu_is_coherent() checks
// memory_window_end > memory_window_start as its condition.
// memory_window_end is inclusive but uvm_parent_gpu_is_coherent()
// checks memory_window_end > memory_window_start as its condition.
UVM_ASSERT(gpu_info->systemMemoryWindowSize > 1);
parent_gpu->system_bus.memory_window_start = gpu_info->systemMemoryWindowStart;
parent_gpu->system_bus.memory_window_end = gpu_info->systemMemoryWindowStart +
@@ -136,12 +136,12 @@ static NV_STATUS get_gpu_caps(uvm_gpu_t *gpu)
return status;
if (gpu_caps.numaEnabled) {
UVM_ASSERT(uvm_gpu_is_coherent(gpu->parent));
UVM_ASSERT(uvm_parent_gpu_is_coherent(gpu->parent));
gpu->mem_info.numa.enabled = true;
gpu->mem_info.numa.node_id = gpu_caps.numaNodeId;
}
else {
UVM_ASSERT(!uvm_gpu_is_coherent(gpu->parent));
UVM_ASSERT(!uvm_parent_gpu_is_coherent(gpu->parent));
}
return NV_OK;
@@ -1089,7 +1089,7 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
{
NV_STATUS status;
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(g_uvm_global.rm_session_handle,
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_global_session_handle(),
gpu_info,
gpu_uuid,
&parent_gpu->rm_device,
@@ -1099,7 +1099,12 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
return status;
}
uvm_conf_computing_check_parent_gpu(parent_gpu);
status = uvm_conf_computing_init_parent_gpu(parent_gpu);
if (status != NV_OK) {
UVM_ERR_PRINT("Confidential computing: %s, GPU %s\n",
nvstatusToString(status), parent_gpu->name);
return status;
}
parent_gpu->pci_dev = gpu_platform_info->pci_dev;
parent_gpu->closest_cpu_numa_node = dev_to_node(&parent_gpu->pci_dev->dev);
@@ -1161,19 +1166,8 @@ static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info)
{
NV_STATUS status;
// Presently, an RM client can only subscribe to a single partition per
// GPU. Therefore, UVM needs to create several RM clients. For simplicity,
// and since P2P is not supported when SMC partitions are created, we
// create a client (session) per GPU partition.
if (gpu->parent->smc.enabled) {
UvmPlatformInfo platform_info;
status = uvm_rm_locked_call(nvUvmInterfaceSessionCreate(&gpu->smc.rm_session_handle, &platform_info));
if (status != NV_OK) {
UVM_ERR_PRINT("Creating RM session failed: %s\n", nvstatusToString(status));
return status;
}
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_gpu_session_handle(gpu),
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_global_session_handle(),
gpu_info,
uvm_gpu_uuid(gpu),
&gpu->smc.rm_device,
@@ -1543,9 +1537,6 @@ static void deinit_gpu(uvm_gpu_t *gpu)
if (gpu->parent->smc.enabled) {
if (gpu->smc.rm_device != 0)
uvm_rm_locked_call_void(nvUvmInterfaceDeviceDestroy(gpu->smc.rm_device));
if (gpu->smc.rm_session_handle != 0)
uvm_rm_locked_call_void(nvUvmInterfaceSessionDestroy(gpu->smc.rm_session_handle));
}
gpu->magic = 0;
@@ -2575,7 +2566,7 @@ static void disable_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
uvm_mmu_destroy_peer_identity_mappings(gpu0, gpu1);
uvm_mmu_destroy_peer_identity_mappings(gpu1, gpu0);
uvm_rm_locked_call_void(nvUvmInterfaceP2pObjectDestroy(uvm_gpu_session_handle(gpu0), p2p_handle));
uvm_rm_locked_call_void(nvUvmInterfaceP2pObjectDestroy(uvm_global_session_handle(), p2p_handle));
UVM_ASSERT(uvm_gpu_get(gpu0->global_id) == gpu0);
UVM_ASSERT(uvm_gpu_get(gpu1->global_id) == gpu1);
@@ -2701,9 +2692,9 @@ uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_p
return id;
}
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id1, const uvm_gpu_id_t gpu_id2)
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1)
{
NvU32 table_index = uvm_gpu_peer_table_index(gpu_id1, gpu_id2);
NvU32 table_index = uvm_gpu_peer_table_index(gpu_id0, gpu_id1);
return &g_uvm_global.peers[table_index];
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -167,7 +167,7 @@ struct uvm_service_block_context_struct
} per_processor_masks[UVM_ID_MAX_PROCESSORS];
// State used by the VA block routines called by the servicing routine
uvm_va_block_context_t block_context;
uvm_va_block_context_t *block_context;
// Prefetch state hint
uvm_perf_prefetch_hint_t prefetch_hint;
@@ -263,7 +263,10 @@ struct uvm_fault_service_batch_context_struct
NvU32 num_coalesced_faults;
bool has_fatal_faults;
// One of the VA spaces in this batch which had fatal faults. If NULL, no
// faults were fatal. More than one VA space could have fatal faults, but we
// pick one to be the target of the cancel sequence.
uvm_va_space_t *fatal_va_space;
bool has_throttled_faults;
@@ -825,8 +828,6 @@ struct uvm_gpu_struct
{
NvU32 swizz_id;
uvmGpuSessionHandle rm_session_handle;
// RM device handle used in many of the UVM/RM APIs.
//
// Do not read this field directly, use uvm_gpu_device_handle instead.
@@ -1162,6 +1163,16 @@ struct uvm_parent_gpu_struct
NvU64 memory_window_start;
NvU64 memory_window_end;
} system_bus;
// WAR to issue ATS TLB invalidation commands ourselves.
struct
{
uvm_mutex_t smmu_lock;
struct page *smmu_cmdq;
void __iomem *smmu_cmdqv_base;
unsigned long smmu_prod;
unsigned long smmu_cons;
} smmu_war;
};
static const char *uvm_gpu_name(uvm_gpu_t *gpu)
@@ -1336,7 +1347,7 @@ static NvU64 uvm_gpu_retained_count(uvm_gpu_t *gpu)
void uvm_parent_gpu_kref_put(uvm_parent_gpu_t *gpu);
// Calculates peer table index using GPU ids.
NvU32 uvm_gpu_peer_table_index(uvm_gpu_id_t gpu_id1, uvm_gpu_id_t gpu_id2);
NvU32 uvm_gpu_peer_table_index(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);
// Either retains an existing PCIe peer entry or creates a new one. In both
// cases the two GPUs are also each retained.
@@ -1355,7 +1366,7 @@ uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu
uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr);
// Get the P2P capabilities between the gpus with the given indexes
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(uvm_gpu_id_t gpu_id1, uvm_gpu_id_t gpu_id2);
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);
// Get the P2P capabilities between the given gpus
static uvm_gpu_peer_t *uvm_gpu_peer_caps(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
@@ -1363,10 +1374,10 @@ static uvm_gpu_peer_t *uvm_gpu_peer_caps(const uvm_gpu_t *gpu0, const uvm_gpu_t
return uvm_gpu_index_peer_caps(gpu0->id, gpu1->id);
}
static bool uvm_gpus_are_nvswitch_connected(uvm_gpu_t *gpu1, uvm_gpu_t *gpu2)
static bool uvm_gpus_are_nvswitch_connected(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
if (gpu1->parent->nvswitch_info.is_nvswitch_connected && gpu2->parent->nvswitch_info.is_nvswitch_connected) {
UVM_ASSERT(uvm_gpu_peer_caps(gpu1, gpu2)->link_type >= UVM_GPU_LINK_NVLINK_2);
if (gpu0->parent->nvswitch_info.is_nvswitch_connected && gpu1->parent->nvswitch_info.is_nvswitch_connected) {
UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type >= UVM_GPU_LINK_NVLINK_2);
return true;
}
@@ -1511,7 +1522,7 @@ bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
// addresses.
NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr);
static bool uvm_gpu_is_coherent(const uvm_parent_gpu_t *parent_gpu)
static bool uvm_parent_gpu_is_coherent(const uvm_parent_gpu_t *parent_gpu)
{
return parent_gpu->system_bus.memory_window_end > parent_gpu->system_bus.memory_window_start;
}
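A small worked example of the check just above, using the NvU64 and bool types from the surrounding header. In fill_gpu_info() earlier in this commit, memory_window_end is stored as the inclusive end (start + size - 1) and the size is asserted to be greater than 1, so "end > start" is equivalent to "a system memory window larger than one byte exists". The values below are illustrative only.

// Illustrative sketch, not driver code.
static bool example_is_coherent(NvU64 window_start, NvU64 window_size)
{
    // A GPU with no system memory window leaves start == end (both zero),
    // which the check below correctly reports as not coherent.
    NvU64 window_end = window_size ? (window_start + window_size - 1) : window_start;

    return window_end > window_start;   // true iff window_size > 1
}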

View File

@@ -985,7 +985,7 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
return NV_OK;
if (uvm_processor_mask_test(&va_block->resident, processor))
residency_mask = uvm_va_block_resident_mask_get(va_block, processor);
residency_mask = uvm_va_block_resident_mask_get(va_block, processor, NUMA_NO_NODE);
else
residency_mask = NULL;
@@ -1036,8 +1036,8 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
// If the underlying VMA is gone, skip HMM migrations.
if (uvm_va_block_is_hmm(va_block)) {
status = uvm_hmm_find_vma(service_context->block_context.mm,
&service_context->block_context.hmm.vma,
status = uvm_hmm_find_vma(service_context->block_context->mm,
&service_context->block_context->hmm.vma,
address);
if (status == NV_ERR_INVALID_ADDRESS)
continue;
@@ -1048,7 +1048,7 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
policy = uvm_va_policy_get(va_block, address);
new_residency = uvm_va_block_select_residency(va_block,
&service_context->block_context,
service_context->block_context,
page_index,
processor,
uvm_fault_access_type_mask_bit(UVM_FAULT_ACCESS_TYPE_PREFETCH),
@@ -1083,7 +1083,7 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
// Remove pages that are already resident in the destination processors
for_each_id_in_mask(id, &update_processors) {
bool migrate_pages;
uvm_page_mask_t *residency_mask = uvm_va_block_resident_mask_get(va_block, id);
uvm_page_mask_t *residency_mask = uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE);
UVM_ASSERT(residency_mask);
migrate_pages = uvm_page_mask_andnot(&service_context->per_processor_masks[uvm_id_value(id)].new_residency,
@@ -1101,9 +1101,9 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
if (uvm_va_block_is_hmm(va_block)) {
status = NV_ERR_INVALID_ADDRESS;
if (service_context->block_context.mm) {
if (service_context->block_context->mm) {
status = uvm_hmm_find_policy_vma_and_outer(va_block,
&service_context->block_context.hmm.vma,
&service_context->block_context->hmm.vma,
first_page_index,
&policy,
&outer);
@@ -1206,7 +1206,7 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
service_context->operation = UVM_SERVICE_OPERATION_ACCESS_COUNTERS;
service_context->num_retries = 0;
service_context->block_context.mm = mm;
service_context->block_context->mm = mm;
if (uvm_va_block_is_hmm(va_block)) {
uvm_hmm_service_context_init(service_context);

View File

@@ -292,6 +292,7 @@ NV_STATUS uvm_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
{
NV_STATUS status = NV_OK;
char kthread_name[TASK_COMM_LEN + 1];
uvm_va_block_context_t *block_context;
if (parent_gpu->replayable_faults_supported) {
status = uvm_gpu_fault_buffer_init(parent_gpu);
@@ -311,6 +312,12 @@ NV_STATUS uvm_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
if (!parent_gpu->isr.replayable_faults.stats.cpu_exec_count)
return NV_ERR_NO_MEMORY;
block_context = uvm_va_block_context_alloc(NULL);
if (!block_context)
return NV_ERR_NO_MEMORY;
parent_gpu->fault_buffer_info.replayable.block_service_context.block_context = block_context;
parent_gpu->isr.replayable_faults.handling = true;
snprintf(kthread_name, sizeof(kthread_name), "UVM GPU%u BH", uvm_id_value(parent_gpu->id));
@@ -333,6 +340,12 @@ NV_STATUS uvm_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
if (!parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count)
return NV_ERR_NO_MEMORY;
block_context = uvm_va_block_context_alloc(NULL);
if (!block_context)
return NV_ERR_NO_MEMORY;
parent_gpu->fault_buffer_info.non_replayable.block_service_context.block_context = block_context;
parent_gpu->isr.non_replayable_faults.handling = true;
snprintf(kthread_name, sizeof(kthread_name), "UVM GPU%u KC", uvm_id_value(parent_gpu->id));
@@ -356,6 +369,13 @@ NV_STATUS uvm_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
return status;
}
block_context = uvm_va_block_context_alloc(NULL);
if (!block_context)
return NV_ERR_NO_MEMORY;
parent_gpu->access_counter_buffer_info.batch_service_context.block_service_context.block_context =
block_context;
nv_kthread_q_item_init(&parent_gpu->isr.access_counters.bottom_half_q_item,
access_counters_isr_bottom_half_entry,
parent_gpu);
@@ -410,6 +430,8 @@ void uvm_gpu_disable_isr(uvm_parent_gpu_t *parent_gpu)
void uvm_gpu_deinit_isr(uvm_parent_gpu_t *parent_gpu)
{
uvm_va_block_context_t *block_context;
// Return ownership to RM:
if (parent_gpu->isr.replayable_faults.was_handling) {
// No user threads could have anything left on
@@ -439,8 +461,18 @@ void uvm_gpu_deinit_isr(uvm_parent_gpu_t *parent_gpu)
// It is safe to deinitialize access counters even if they have not been
// successfully initialized.
uvm_gpu_deinit_access_counters(parent_gpu);
block_context =
parent_gpu->access_counter_buffer_info.batch_service_context.block_service_context.block_context;
uvm_va_block_context_free(block_context);
}
if (parent_gpu->non_replayable_faults_supported) {
block_context = parent_gpu->fault_buffer_info.non_replayable.block_service_context.block_context;
uvm_va_block_context_free(block_context);
}
block_context = parent_gpu->fault_buffer_info.replayable.block_service_context.block_context;
uvm_va_block_context_free(block_context);
uvm_kvfree(parent_gpu->isr.replayable_faults.stats.cpu_exec_count);
uvm_kvfree(parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count);
uvm_kvfree(parent_gpu->isr.access_counters.stats.cpu_exec_count);

View File

@@ -370,7 +370,7 @@ static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
// Check logical permissions
status = uvm_va_block_check_logical_permissions(va_block,
&service_context->block_context,
service_context->block_context,
gpu->id,
uvm_va_block_cpu_page_index(va_block,
fault_entry->fault_address),
@@ -393,7 +393,7 @@ static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
// Compute new residency and update the masks
new_residency = uvm_va_block_select_residency(va_block,
&service_context->block_context,
service_context->block_context,
page_index,
gpu->id,
fault_entry->access_type_mask,
@@ -629,7 +629,7 @@ static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_e
uvm_gpu_va_space_t *gpu_va_space;
uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
uvm_va_block_context_t *va_block_context =
&gpu->parent->fault_buffer_info.non_replayable.block_service_context.block_context;
gpu->parent->fault_buffer_info.non_replayable.block_service_context.block_context;
status = uvm_gpu_fault_entry_to_va_space(gpu, fault_entry, &va_space);
if (status != NV_OK) {
@@ -655,7 +655,7 @@ static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_e
// to remain valid until we release. If no mm is registered, we
// can only service managed faults, not ATS/HMM faults.
mm = uvm_va_space_mm_retain_lock(va_space);
va_block_context->mm = mm;
uvm_va_block_context_init(va_block_context, mm);
uvm_va_space_down_read(va_space);

View File

@@ -1180,7 +1180,11 @@ static void mark_fault_fatal(uvm_fault_service_batch_context_t *batch_context,
fault_entry->replayable.cancel_va_mode = cancel_va_mode;
utlb->has_fatal_faults = true;
batch_context->has_fatal_faults = true;
if (!batch_context->fatal_va_space) {
UVM_ASSERT(fault_entry->va_space);
batch_context->fatal_va_space = fault_entry->va_space;
}
}
static void fault_entry_duplicate_flags(uvm_fault_service_batch_context_t *batch_context,
@@ -1230,7 +1234,7 @@ static uvm_fault_access_type_t check_fault_access_permissions(uvm_gpu_t *gpu,
UvmEventFatalReason fatal_reason;
uvm_fault_cancel_va_mode_t cancel_va_mode;
uvm_fault_access_type_t ret = UVM_FAULT_ACCESS_TYPE_COUNT;
uvm_va_block_context_t *va_block_context = &service_block_context->block_context;
uvm_va_block_context_t *va_block_context = service_block_context->block_context;
perm_status = uvm_va_block_check_logical_permissions(va_block,
va_block_context,
@@ -1345,7 +1349,7 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
if (uvm_va_block_is_hmm(va_block)) {
policy = uvm_hmm_find_policy_end(va_block,
block_context->block_context.hmm.vma,
block_context->block_context->hmm.vma,
ordered_fault_cache[first_fault_index]->fault_address,
&end);
}
@@ -1469,7 +1473,7 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
// Compute new residency and update the masks
new_residency = uvm_va_block_select_residency(va_block,
&block_context->block_context,
block_context->block_context,
page_index,
gpu->id,
service_access_type_mask,
@@ -1511,8 +1515,8 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
++block_context->num_retries;
if (status == NV_OK && batch_context->has_fatal_faults)
status = uvm_va_block_set_cancel(va_block, &block_context->block_context, gpu);
if (status == NV_OK && batch_context->fatal_va_space)
status = uvm_va_block_set_cancel(va_block, block_context->block_context, gpu);
return status;
}
@@ -1860,7 +1864,7 @@ static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
uvm_va_block_t *va_block;
uvm_gpu_t *gpu = gpu_va_space->gpu;
uvm_va_block_context_t *va_block_context =
&gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[fault_index];
struct mm_struct *mm = va_block_context->mm;
NvU64 fault_address = current_entry->fault_address;
@@ -1937,14 +1941,198 @@ static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
return status;
}
// Called when a fault in the batch has been marked fatal. Flush the buffer
// under the VA and mmap locks to remove any potential stale fatal faults, then
// service all new faults for just that VA space and cancel those which are
// fatal. Faults in other VA spaces are replayed when done and will be processed
// when normal fault servicing resumes.
static NV_STATUS service_fault_batch_for_cancel(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
NV_STATUS status = NV_OK;
NvU32 i;
uvm_va_space_t *va_space = batch_context->fatal_va_space;
uvm_gpu_va_space_t *gpu_va_space = NULL;
struct mm_struct *mm;
uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
uvm_service_block_context_t *service_context = &gpu->parent->fault_buffer_info.replayable.block_service_context;
uvm_va_block_context_t *va_block_context = service_context->block_context;
UVM_ASSERT(gpu->parent->replayable_faults_supported);
UVM_ASSERT(va_space);
// Perform the flush and re-fetch while holding the mmap_lock and the
// VA space lock. This avoids stale faults because it prevents any vma
// modifications (mmap, munmap, mprotect) from happening between the time HW
// takes the fault and we cancel it.
mm = uvm_va_space_mm_retain_lock(va_space);
uvm_va_block_context_init(va_block_context, mm);
uvm_va_space_down_read(va_space);
// We saw fatal faults in this VA space before. Flush while holding
// mmap_lock to make sure those faults come back (aren't stale).
//
// We need to wait until all old fault messages have arrived before
// flushing, hence UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT.
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status != NV_OK)
goto done;
// Wait for the flush's replay to finish to give the legitimate faults a
// chance to show up in the buffer again.
status = uvm_tracker_wait(&replayable_faults->replay_tracker);
if (status != NV_OK)
goto done;
// We expect all replayed faults to have arrived in the buffer so we can re-
// service them. The replay-and-wait sequence above will ensure they're all
// in the HW buffer. When GSP owns the HW buffer, we also have to wait for
// GSP to copy all available faults from the HW buffer into the shadow
// buffer.
//
// TODO: Bug 2533557: This flush does not actually guarantee that GSP will
// copy over all faults.
status = hw_fault_buffer_flush_locked(gpu->parent);
if (status != NV_OK)
goto done;
// If there is no GPU VA space for the GPU, ignore all faults in the VA
// space. This can happen if the GPU VA space has been destroyed since we
// unlocked the VA space in service_fault_batch. That means the fatal faults
// are stale, because unregistering the GPU VA space requires preempting the
// context and detaching all channels in that VA space. Restart fault
// servicing from the top.
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (!gpu_va_space)
goto done;
// Re-parse the new faults
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_duplicate_faults = 0;
batch_context->num_replays = 0;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_ALL);
if (status != NV_OK)
goto done;
// No more faults left. Either the previously-seen fatal entry was stale, or
// RM killed the context underneath us.
if (batch_context->num_cached_faults == 0)
goto done;
++batch_context->batch_id;
status = preprocess_fault_batch(gpu, batch_context);
if (status != NV_OK) {
if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
// Another flush happened due to stale faults or a context-fatal
// error. The previously-seen fatal fault might not exist anymore,
// so restart fault servicing from the top.
status = NV_OK;
}
goto done;
}
// Search for the target VA space
for (i = 0; i < batch_context->num_coalesced_faults; i++) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
UVM_ASSERT(current_entry->va_space);
if (current_entry->va_space == va_space)
break;
}
while (i < batch_context->num_coalesced_faults) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
if (current_entry->va_space != va_space)
break;
// service_fault_batch_dispatch() doesn't expect unserviceable faults.
// Just cancel them directly.
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, UVM_FAULT_CANCEL_VA_MODE_ALL);
if (status != NV_OK)
break;
++i;
}
else {
uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
NvU32 block_faults;
ats_invalidate->write_faults_in_batch = false;
uvm_hmm_service_context_init(service_context);
// Service all the faults that we can. We only really need to search
// for fatal faults, but attempting to service all is the easiest
// way to do that.
status = service_fault_batch_dispatch(va_space, gpu_va_space, batch_context, i, &block_faults, false);
if (status != NV_OK) {
// TODO: Bug 3900733: clean up locking in service_fault_batch().
// We need to drop lock and retry. That means flushing and
// starting over.
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
status = NV_OK;
break;
}
// Invalidate TLBs before cancel to ensure that fatal faults don't
// get stuck in HW behind non-fatal faults to the same line.
status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
if (status != NV_OK)
break;
while (block_faults-- > 0) {
current_entry = batch_context->ordered_fault_cache[i];
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode);
if (status != NV_OK)
break;
}
++i;
}
}
}
done:
uvm_va_space_up_read(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
if (status == NV_OK) {
// There are two reasons to flush the fault buffer here.
//
// 1) Functional. We need to replay both the serviced non-fatal faults
// and the skipped faults in other VA spaces. The former need to be
// restarted and the latter need to be replayed so the normal fault
// service mechanism can fetch and process them.
//
// 2) Performance. After cancelling the fatal faults, a flush removes
// any potential duplicated fault that may have been added while
// processing the faults in this batch. This flush also avoids doing
// unnecessary processing after the fatal faults have been cancelled,
// so all the rest are unlikely to remain after a replay because the
// context is probably in the process of dying.
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
}
return status;
}
// Scan the ordered view of faults and group them by different va_blocks
// (managed faults) and service faults for each va_block, in batch.
// Service non-managed faults one at a time as they are encountered during the
// scan.
//
// This function returns NV_WARN_MORE_PROCESSING_REQUIRED if the fault buffer
// was flushed because the needs_fault_buffer_flush flag was set on some GPU VA
// space
// Fatal faults are marked for later processing by the caller.
static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
fault_service_mode_t service_mode,
uvm_fault_service_batch_context_t *batch_context)
@@ -1959,7 +2147,7 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK;
uvm_service_block_context_t *service_context =
&gpu->parent->fault_buffer_info.replayable.block_service_context;
uvm_va_block_context_t *va_block_context = &service_context->block_context;
uvm_va_block_context_t *va_block_context = service_context->block_context;
UVM_ASSERT(gpu->parent->replayable_faults_supported);
@@ -1995,41 +2183,28 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
// to remain valid until we release. If no mm is registered, we
// can only service managed faults, not ATS/HMM faults.
mm = uvm_va_space_mm_retain_lock(va_space);
va_block_context->mm = mm;
uvm_va_block_context_init(va_block_context, mm);
uvm_va_space_down_read(va_space);
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (uvm_processor_mask_test_and_clear_atomic(&va_space->needs_fault_buffer_flush, gpu->id)) {
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status == NV_OK)
status = NV_WARN_MORE_PROCESSING_REQUIRED;
break;
}
// The case where there is no valid GPU VA space for the GPU in this
// VA space is handled next
}
// Some faults could be already fatal if they cannot be handled by
// the UVM driver
if (current_entry->is_fatal) {
++i;
batch_context->has_fatal_faults = true;
if (!batch_context->fatal_va_space)
batch_context->fatal_va_space = va_space;
utlb->has_fatal_faults = true;
UVM_ASSERT(utlb->num_pending_faults > 0);
continue;
}
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
if (!gpu_va_space) {
// If there is no GPU VA space for the GPU, ignore the fault. This
// can happen if a GPU VA space is destroyed without explicitly
// freeing all memory ranges (destroying the VA range triggers a
// flush of the fault buffer) and there are stale entries in the
// freeing all memory ranges and there are stale entries in the
// buffer that got fixed by the servicing in a previous batch.
++i;
continue;
@@ -2057,7 +2232,7 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
i += block_faults;
// Don't issue replays in cancel mode
if (replay_per_va_block && !batch_context->has_fatal_faults) {
if (replay_per_va_block && !batch_context->fatal_va_space) {
status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
if (status != NV_OK)
goto fail;
@@ -2069,8 +2244,6 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
}
}
// Only clobber status if invalidate_status != NV_OK, since status may also
// contain NV_WARN_MORE_PROCESSING_REQUIRED.
if (va_space != NULL) {
NV_STATUS invalidate_status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
if (invalidate_status != NV_OK)
@@ -2278,64 +2451,6 @@ static NvU32 is_fatal_fault_in_buffer(uvm_fault_service_batch_context_t *batch_c
return false;
}
// Cancel just the faults flagged as fatal in the given fault service batch
// context.
static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
NV_STATUS status = NV_OK;
NV_STATUS fault_status;
uvm_va_space_t *va_space = NULL;
NvU32 i;
UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
UVM_ASSERT(current_entry->va_space);
if (current_entry->va_space != va_space) {
// Fault on a different va_space, drop the lock of the old one...
if (va_space != NULL)
uvm_va_space_up_read(va_space);
va_space = current_entry->va_space;
// ... and take the lock of the new one
uvm_va_space_down_read(va_space);
// We don't need to check whether a buffer flush is required
// (due to VA range destruction). Once a fault is flagged as fatal
// we need to cancel it, even if its VA range no longer exists.
}
// See the comment for the same check in cancel_faults_all
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id))
continue;
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode);
if (status != NV_OK)
break;
}
}
if (va_space != NULL)
uvm_va_space_up_read(va_space);
// See the comment on flushing in cancel_faults_all
fault_status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
// We report the first encountered error.
if (status == NV_OK)
status = fault_status;
return status;
}
// Cancel all faults in the given fault service batch context, even those not
// marked as fatal.
static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
@@ -2344,56 +2459,51 @@ static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
{
NV_STATUS status = NV_OK;
NV_STATUS fault_status;
uvm_va_space_t *va_space = NULL;
NvU32 i;
NvU32 i = 0;
UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
while (i < batch_context->num_coalesced_faults && status == NV_OK) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
uvm_fault_cancel_va_mode_t cancel_va_mode;
uvm_va_space_t *va_space = current_entry->va_space;
bool skip_va_space;
UVM_ASSERT(current_entry->va_space);
UVM_ASSERT(va_space);
if (current_entry->va_space != va_space) {
// Fault on a different va_space, drop the lock of the old one...
if (va_space != NULL)
uvm_va_space_up_read(va_space);
uvm_va_space_down_read(va_space);
va_space = current_entry->va_space;
// If there is no GPU VA space for the GPU, ignore all faults in
// that VA space. This can happen if the GPU VA space has been
// destroyed since we unlocked the VA space in service_fault_batch.
// Ignoring the fault avoids targeting a PDB that might have been
// reused by another process.
skip_va_space = !uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
// ... and take the lock of the new one
uvm_va_space_down_read(va_space);
for (;
i < batch_context->num_coalesced_faults && current_entry->va_space == va_space;
current_entry = batch_context->ordered_fault_cache[++i]) {
uvm_fault_cancel_va_mode_t cancel_va_mode;
if (skip_va_space)
continue;
if (current_entry->is_fatal) {
UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
cancel_va_mode = current_entry->replayable.cancel_va_mode;
}
else {
current_entry->fatal_reason = reason;
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
if (status != NV_OK)
break;
}
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
// If there is no GPU VA space for the GPU, ignore the fault.
// This can happen if the GPU VA space did not exist in
// service_fault_batch(), or it was destroyed since then.
// This is to avoid targeting a PDB that might have been reused
// by another process.
continue;
}
// If the fault was already marked fatal, use its reason and cancel
// mode. Otherwise use the provided reason.
if (current_entry->is_fatal) {
UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
cancel_va_mode = current_entry->replayable.cancel_va_mode;
}
else {
current_entry->fatal_reason = reason;
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
if (status != NV_OK)
break;
}
if (va_space != NULL)
uvm_va_space_up_read(va_space);
}
// Because each cancel itself triggers a replay, there may be a large number
// of new duplicated faults in the buffer after cancelling all the known
@@ -2537,7 +2647,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_replays = 0;
batch_context->has_fatal_faults = false;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
// 5) Fetch all faults from buffer
@@ -2584,9 +2694,6 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
// 8) Service all non-fatal faults and mark all non-serviceable faults
// as fatal
status = service_fault_batch(gpu, FAULT_SERVICE_MODE_CANCEL, batch_context);
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
continue;
UVM_ASSERT(batch_context->num_replays == 0);
if (status == NV_ERR_NO_MEMORY)
continue;
@@ -2594,7 +2701,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
break;
// No more fatal faults left, we are done
if (!batch_context->has_fatal_faults)
if (!batch_context->fatal_va_space)
break;
// 9) Search for uTLBs that contain fatal faults and meet the
@@ -2616,9 +2723,9 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
static NV_STATUS cancel_faults_precise(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
UVM_ASSERT(batch_context->has_fatal_faults);
UVM_ASSERT(batch_context->fatal_va_space);
if (gpu->parent->fault_cancel_va_supported)
return cancel_faults_precise_va(gpu, batch_context);
return service_fault_batch_for_cancel(gpu, batch_context);
return cancel_faults_precise_tlb(gpu, batch_context);
}
@@ -2674,7 +2781,7 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_duplicate_faults = 0;
batch_context->num_replays = 0;
batch_context->has_fatal_faults = false;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_BATCH_READY);
@@ -2702,9 +2809,6 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
// was flushed
num_replays += batch_context->num_replays;
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
continue;
enable_disable_prefetch_faults(gpu->parent, batch_context);
if (status != NV_OK) {
@@ -2718,10 +2822,17 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
break;
}
if (batch_context->has_fatal_faults) {
if (batch_context->fatal_va_space) {
status = uvm_tracker_wait(&batch_context->tracker);
if (status == NV_OK)
if (status == NV_OK) {
status = cancel_faults_precise(gpu, batch_context);
if (status == NV_OK) {
// Cancel handling should've issued at least one replay
UVM_ASSERT(batch_context->num_replays > 0);
++num_batches;
continue;
}
}
break;
}

View File

@@ -794,7 +794,7 @@ uvm_membar_t uvm_hal_downgrade_membar_type(uvm_gpu_t *gpu, bool is_local_vidmem)
// memory, including those from other processors like the CPU or peer GPUs,
// must come through this GPU's L2. In all current architectures, MEMBAR_GPU
// is sufficient to resolve ordering at the L2 level.
if (is_local_vidmem && !uvm_gpu_is_coherent(gpu->parent) && !uvm_downgrade_force_membar_sys)
if (is_local_vidmem && !uvm_parent_gpu_is_coherent(gpu->parent) && !uvm_downgrade_force_membar_sys)
return UVM_MEMBAR_GPU;
// If the mapped memory was remote, or if a coherence protocol can cache

View File

@@ -60,6 +60,8 @@ module_param(uvm_disable_hmm, bool, 0444);
#include "uvm_gpu.h"
#include "uvm_pmm_gpu.h"
#include "uvm_hal_types.h"
#include "uvm_push.h"
#include "uvm_hal.h"
#include "uvm_va_block_types.h"
#include "uvm_va_space_mm.h"
#include "uvm_va_space.h"
@@ -110,20 +112,7 @@ typedef struct
bool uvm_hmm_is_enabled_system_wide(void)
{
if (uvm_disable_hmm)
return false;
if (g_uvm_global.ats.enabled)
return false;
// Confidential Computing and HMM impose mutually exclusive constraints. In
// Confidential Computing the GPU can only access pages resident in vidmem,
// but in HMM pages may be required to be resident in sysmem: file backed
// VMAs, huge pages, etc.
if (g_uvm_global.conf_computing_enabled)
return false;
return uvm_va_space_mm_enabled_system();
return !uvm_disable_hmm && !g_uvm_global.ats.enabled && uvm_va_space_mm_enabled_system();
}
bool uvm_hmm_is_enabled(uvm_va_space_t *va_space)
@@ -140,6 +129,100 @@ static uvm_va_block_t *hmm_va_block_from_node(uvm_range_tree_node_t *node)
return container_of(node, uvm_va_block_t, hmm.node);
}
// Copies the contents of the source device-private page to the
// destination CPU page. This will invalidate mappings, so cannot be
// called while holding any va_block locks.
static NV_STATUS uvm_hmm_copy_devmem_page(struct page *dst_page, struct page *src_page, uvm_tracker_t *tracker)
{
uvm_gpu_phys_address_t src_addr;
uvm_gpu_phys_address_t dst_addr;
uvm_gpu_chunk_t *gpu_chunk;
NvU64 dma_addr;
uvm_push_t push;
NV_STATUS status = NV_OK;
uvm_gpu_t *gpu;
// Holding a reference on the device-private page ensures the GPU is
// already retained. This is because when a GPU is unregistered all
// device-private pages are migrated back to the CPU and freed before
// releasing the GPU. Therefore, if we could get a reference to the
// page, the GPU must be retained.
UVM_ASSERT(is_device_private_page(src_page) && page_count(src_page));
gpu_chunk = uvm_pmm_devmem_page_to_chunk(src_page);
gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
status = uvm_mmu_chunk_map(gpu_chunk);
if (status != NV_OK)
return status;
status = uvm_gpu_map_cpu_pages(gpu->parent, dst_page, PAGE_SIZE, &dma_addr);
if (status != NV_OK)
goto out_unmap_gpu;
dst_addr = uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
src_addr = uvm_gpu_phys_address(UVM_APERTURE_VID, gpu_chunk->address);
status = uvm_push_begin_acquire(gpu->channel_manager,
UVM_CHANNEL_TYPE_GPU_TO_CPU,
tracker,
&push,
"Copy for remote process fault");
if (status != NV_OK)
goto out_unmap_cpu;
gpu->parent->ce_hal->memcopy(&push,
uvm_gpu_address_copy(gpu, dst_addr),
uvm_gpu_address_copy(gpu, src_addr),
PAGE_SIZE);
uvm_push_end(&push);
status = uvm_tracker_add_push_safe(tracker, &push);
out_unmap_cpu:
uvm_gpu_unmap_cpu_pages(gpu->parent, dma_addr, PAGE_SIZE);
out_unmap_gpu:
uvm_mmu_chunk_unmap(gpu_chunk, NULL);
return status;
}
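// Evict a single device-private page, identified by its PFN, back to a newly
// allocated system memory page. The contents are copied with
// uvm_hmm_copy_devmem_page(); if the copy fails, the destination page is
// zero-filled instead. Returns NV_ERR_BUSY_RETRY if the page could not be
// migrated.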
static NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
{
unsigned long src_pfn = 0;
unsigned long dst_pfn = 0;
struct page *dst_page;
NV_STATUS status = NV_OK;
int ret;
ret = migrate_device_range(&src_pfn, pfn, 1);
if (ret)
return errno_to_nv_status(ret);
if (src_pfn & MIGRATE_PFN_MIGRATE) {
uvm_tracker_t tracker = UVM_TRACKER_INIT();
dst_page = alloc_page(GFP_HIGHUSER_MOVABLE);
if (!dst_page) {
status = NV_ERR_NO_MEMORY;
goto out;
}
lock_page(dst_page);
if (WARN_ON(uvm_hmm_copy_devmem_page(dst_page, migrate_pfn_to_page(src_pfn), &tracker) != NV_OK))
memzero_page(dst_page, 0, PAGE_SIZE);
dst_pfn = migrate_pfn(page_to_pfn(dst_page));
migrate_device_pages(&src_pfn, &dst_pfn, 1);
uvm_tracker_wait_deinit(&tracker);
}
out:
migrate_device_finalize(&src_pfn, &dst_pfn, 1);
if (!(src_pfn & MIGRATE_PFN_MIGRATE))
status = NV_ERR_BUSY_RETRY;
return status;
}
void uvm_hmm_va_space_initialize(uvm_va_space_t *va_space)
{
uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
@@ -199,6 +282,9 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
{
uvm_range_tree_node_t *node;
uvm_va_block_t *va_block;
struct range range = gpu->pmm.devmem.pagemap.range;
unsigned long pfn;
bool retry;
if (!uvm_hmm_is_enabled(va_space))
return;
@@ -207,6 +293,29 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked_write(&va_space->lock);
// There could be pages with page->zone_device_data pointing to the va_space
// which may be about to be freed. Migrate those back to the CPU so we don't
// fault on them. Normally infinite retries are bad, but we don't have any
// option here. Device-private pages can't be pinned so migration should
// eventually succeed. Even if we did eventually bail out of the loop we'd
// just stall in memunmap_pages() anyway.
do {
retry = false;
for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
struct page *page = pfn_to_page(pfn);
UVM_ASSERT(is_device_private_page(page));
// This check is racy because nothing stops the page from being freed
// and even reused. That doesn't matter though: in the worst case the
// migration fails, we retry and find the va_space doesn't match.
if (page->zone_device_data == va_space)
if (uvm_hmm_pmm_gpu_evict_pfn(pfn) != NV_OK)
retry = true;
}
} while (retry);
uvm_range_tree_for_each(node, &va_space->hmm.blocks) {
va_block = hmm_va_block_from_node(node);
@@ -568,7 +677,7 @@ bool uvm_hmm_check_context_vma_is_valid(uvm_va_block_t *va_block,
void uvm_hmm_service_context_init(uvm_service_block_context_t *service_context)
{
// TODO: Bug 4050579: Remove this when swap cached pages can be migrated.
service_context->block_context.hmm.swap_cached = false;
service_context->block_context->hmm.swap_cached = false;
}
NV_STATUS uvm_hmm_migrate_begin(uvm_va_block_t *va_block)
@@ -631,47 +740,6 @@ static NV_STATUS hmm_migrate_range(uvm_va_block_t *va_block,
return status;
}
void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space)
{
// We can't use uvm_va_space_mm_retain(), because the va_space_mm
// should already be dead by now.
struct mm_struct *mm = va_space->va_space_mm.mm;
uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
uvm_range_tree_node_t *node, *next;
uvm_va_block_t *va_block;
uvm_va_block_context_t *block_context;
uvm_down_read_mmap_lock(mm);
uvm_va_space_down_write(va_space);
uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) {
uvm_va_block_region_t region;
struct vm_area_struct *vma;
va_block = hmm_va_block_from_node(node);
block_context = uvm_va_space_block_context(va_space, mm);
uvm_hmm_migrate_begin_wait(va_block);
uvm_mutex_lock(&va_block->lock);
for_each_va_block_vma_region(va_block, mm, vma, &region) {
if (!uvm_hmm_vma_is_valid(vma, vma->vm_start, false))
continue;
block_context->hmm.vma = vma;
uvm_hmm_va_block_migrate_locked(va_block,
NULL,
block_context,
UVM_ID_CPU,
region,
UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
}
uvm_mutex_unlock(&va_block->lock);
uvm_hmm_migrate_finish(va_block);
}
uvm_va_space_up_write(va_space);
uvm_up_read_mmap_lock(mm);
}
NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr)
{
uvm_va_block_test_t *block_test;
@@ -1476,40 +1544,59 @@ static NV_STATUS hmm_va_block_cpu_page_populate(uvm_va_block_t *va_block,
return status;
}
status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, page_index);
status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, chunk, page_index);
if (status != NV_OK) {
uvm_cpu_chunk_remove_from_block(va_block, page_index);
uvm_cpu_chunk_remove_from_block(va_block, page_to_nid(page), page_index);
uvm_cpu_chunk_free(chunk);
}
return status;
}
static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block,
uvm_page_index_t page_index)
static void hmm_va_block_cpu_unpopulate_chunk(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
int chunk_nid,
uvm_page_index_t page_index)
{
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
if (!chunk)
return;
UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_test(&va_block->cpu.resident, page_index));
!uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == PAGE_SIZE);
uvm_cpu_chunk_remove_from_block(va_block, page_index);
uvm_cpu_chunk_remove_from_block(va_block, chunk_nid, page_index);
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
uvm_cpu_chunk_free(chunk);
}
static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block, uvm_page_index_t page_index, struct page *page)
{
uvm_cpu_chunk_t *chunk;
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
if (page) {
chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index);
hmm_va_block_cpu_unpopulate_chunk(va_block, chunk, page_to_nid(page), page_index);
}
else {
int nid;
for_each_possible_uvm_node(nid) {
chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
hmm_va_block_cpu_unpopulate_chunk(va_block, chunk, nid, page_index);
}
}
}
static bool hmm_va_block_cpu_page_is_same(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
struct page *page)
{
struct page *old_page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);
struct page *old_page = uvm_va_block_get_cpu_page(va_block, page_index);
UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_index)));
UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index)));
return old_page == page;
}
@@ -1522,7 +1609,7 @@ static void clear_service_context_masks(uvm_service_block_context_t *service_con
uvm_processor_id_t new_residency,
uvm_page_index_t page_index)
{
uvm_page_mask_clear(&service_context->block_context.caller_page_mask, page_index);
uvm_page_mask_clear(&service_context->block_context->caller_page_mask, page_index);
uvm_page_mask_clear(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency,
page_index);
@@ -1549,7 +1636,6 @@ static void cpu_mapping_set(uvm_va_block_t *va_block,
uvm_page_index_t page_index)
{
uvm_processor_mask_set(&va_block->mapped, UVM_ID_CPU);
uvm_page_mask_set(&va_block->maybe_mapped_pages, page_index);
uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index);
if (is_write)
uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
@@ -1699,7 +1785,7 @@ static NV_STATUS sync_page_and_chunk_state(uvm_va_block_t *va_block,
// migrate_vma_finalize() will release the reference so we should
// clear our pointer to it.
// TODO: Bug 3660922: Need to handle read duplication at some point.
hmm_va_block_cpu_page_unpopulate(va_block, page_index);
hmm_va_block_cpu_page_unpopulate(va_block, page_index, page);
}
}
@@ -1725,7 +1811,7 @@ static void clean_up_non_migrating_page(uvm_va_block_t *va_block,
else {
UVM_ASSERT(page_ref_count(dst_page) == 1);
hmm_va_block_cpu_page_unpopulate(va_block, page_index);
hmm_va_block_cpu_page_unpopulate(va_block, page_index, dst_page);
}
unlock_page(dst_page);
@@ -1760,7 +1846,7 @@ static void lock_block_cpu_page(uvm_va_block_t *va_block,
unsigned long *dst_pfns,
uvm_page_mask_t *same_devmem_page_mask)
{
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(src_page), page_index);
uvm_va_block_region_t chunk_region;
struct page *dst_page;
@@ -1786,7 +1872,7 @@ static void lock_block_cpu_page(uvm_va_block_t *va_block,
// hmm_va_block_cpu_page_unpopulate() or block_kill(). If the page
// does not migrate, it will be freed though.
UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_test(&va_block->cpu.resident, page_index));
!uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
UVM_ASSERT(chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL);
UVM_ASSERT(page_ref_count(dst_page) == 1);
uvm_cpu_chunk_make_hmm(chunk);
@@ -1934,7 +2020,7 @@ static NV_STATUS alloc_and_copy_to_cpu(uvm_va_block_t *va_block,
}
UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_test(&va_block->cpu.resident, page_index));
!uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
// Allocate a user system memory page for the destination.
// This is the typical case since Linux will free the source page when
@@ -2012,8 +2098,8 @@ static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_contex
service_context = devmem_fault_context->service_context;
va_block_retry = devmem_fault_context->va_block_retry;
va_block = devmem_fault_context->va_block;
src_pfns = service_context->block_context.hmm.src_pfns;
dst_pfns = service_context->block_context.hmm.dst_pfns;
src_pfns = service_context->block_context->hmm.src_pfns;
dst_pfns = service_context->block_context->hmm.dst_pfns;
// Build the migration page mask.
// Note that thrashing pinned pages and prefetch pages are already
@@ -2022,7 +2108,7 @@ static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_contex
uvm_page_mask_copy(page_mask, &service_context->per_processor_masks[UVM_ID_CPU_VALUE].new_residency);
status = alloc_and_copy_to_cpu(va_block,
service_context->block_context.hmm.vma,
service_context->block_context->hmm.vma,
src_pfns,
dst_pfns,
service_context->region,
@@ -2057,8 +2143,8 @@ static NV_STATUS uvm_hmm_devmem_fault_finalize_and_map(uvm_hmm_devmem_fault_cont
prefetch_hint = &service_context->prefetch_hint;
va_block = devmem_fault_context->va_block;
va_block_retry = devmem_fault_context->va_block_retry;
src_pfns = service_context->block_context.hmm.src_pfns;
dst_pfns = service_context->block_context.hmm.dst_pfns;
src_pfns = service_context->block_context->hmm.src_pfns;
dst_pfns = service_context->block_context->hmm.dst_pfns;
region = service_context->region;
page_mask = &devmem_fault_context->page_mask;
@@ -2165,8 +2251,7 @@ static NV_STATUS populate_region(uvm_va_block_t *va_block,
// Since we have a stable snapshot of the CPU pages, we can
// update the residency and protection information.
uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
uvm_page_mask_set(&va_block->cpu.resident, page_index);
uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index);
cpu_mapping_set(va_block, pfns[page_index] & HMM_PFN_WRITE, page_index);
}
@@ -2253,7 +2338,7 @@ static void hmm_release_atomic_pages(uvm_va_block_t *va_block,
uvm_page_index_t page_index;
for_each_va_block_page_in_region(page_index, region) {
struct page *page = service_context->block_context.hmm.pages[page_index];
struct page *page = service_context->block_context->hmm.pages[page_index];
if (!page)
continue;
@@ -2269,14 +2354,14 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
uvm_service_block_context_t *service_context)
{
uvm_va_block_region_t region = service_context->region;
struct page **pages = service_context->block_context.hmm.pages;
struct page **pages = service_context->block_context->hmm.pages;
int npages;
uvm_page_index_t page_index;
uvm_make_resident_cause_t cause;
NV_STATUS status;
if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_region_full(&va_block->cpu.resident, region)) {
!uvm_va_block_cpu_is_region_resident_on(va_block, NUMA_NO_NODE, region)) {
// There is an atomic GPU fault. We need to make sure no pages are
// GPU resident so that make_device_exclusive_range() doesn't call
// migrate_to_ram() and cause a va_space lock recursion problem.
@@ -2289,7 +2374,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
status = uvm_hmm_va_block_migrate_locked(va_block,
va_block_retry,
&service_context->block_context,
service_context->block_context,
UVM_ID_CPU,
region,
cause);
@@ -2299,7 +2384,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
// make_device_exclusive_range() will try to call migrate_to_ram()
// and deadlock with ourself if the data isn't CPU resident.
if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_region_full(&va_block->cpu.resident, region)) {
!uvm_va_block_cpu_is_region_resident_on(va_block, NUMA_NO_NODE, region)) {
status = NV_WARN_MORE_PROCESSING_REQUIRED;
goto done;
}
@@ -2309,7 +2394,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
// mmap() files so we check for that here and report a fatal fault.
// Otherwise with the current Linux 6.1 make_device_exclusive_range(),
// it doesn't make the page exclusive and we end up in an endless loop.
if (service_context->block_context.hmm.vma->vm_flags & VM_SHARED) {
if (service_context->block_context->hmm.vma->vm_flags & (VM_SHARED | VM_HUGETLB)) {
status = NV_ERR_NOT_SUPPORTED;
goto done;
}
@@ -2318,7 +2403,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
uvm_mutex_unlock(&va_block->lock);
npages = make_device_exclusive_range(service_context->block_context.mm,
npages = make_device_exclusive_range(service_context->block_context->mm,
uvm_va_block_cpu_page_address(va_block, region.first),
uvm_va_block_cpu_page_address(va_block, region.outer - 1) + PAGE_SIZE,
pages + region.first,
@@ -2356,15 +2441,13 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page));
UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index));
UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
}
else {
NV_STATUS s = hmm_va_block_cpu_page_populate(va_block, page_index, page);
if (s == NV_OK) {
uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
uvm_page_mask_set(&va_block->cpu.resident, page_index);
}
if (s == NV_OK)
uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index);
}
cpu_mapping_clear(va_block, page_index);
@@ -2419,7 +2502,7 @@ static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id,
uvm_service_block_context_t *service_context)
{
uvm_va_block_region_t region = service_context->region;
struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args;
struct migrate_vma *args = &service_context->block_context->hmm.migrate_vma_args;
NV_STATUS status;
int ret;
uvm_hmm_devmem_fault_context_t fault_context = {
@@ -2453,8 +2536,8 @@ static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id,
}
status = hmm_make_resident_cpu(va_block,
service_context->block_context.hmm.vma,
service_context->block_context.hmm.src_pfns,
service_context->block_context->hmm.vma,
service_context->block_context->hmm.src_pfns,
region,
service_context->access_type,
&fault_context.same_devmem_page_mask);
@@ -2476,9 +2559,9 @@ static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id,
}
}
args->vma = service_context->block_context.hmm.vma;
args->src = service_context->block_context.hmm.src_pfns + region.first;
args->dst = service_context->block_context.hmm.dst_pfns + region.first;
args->vma = service_context->block_context->hmm.vma;
args->src = service_context->block_context->hmm.src_pfns + region.first;
args->dst = service_context->block_context->hmm.dst_pfns + region.first;
args->start = uvm_va_block_region_start(va_block, region);
args->end = uvm_va_block_region_end(va_block, region) + 1;
args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
@@ -2558,7 +2641,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
// TODO: Bug 4050579: Remove this when swap cached pages can be
// migrated.
if (service_context) {
service_context->block_context.hmm.swap_cached = true;
service_context->block_context->hmm.swap_cached = true;
break;
}
@@ -2574,7 +2657,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, src_page));
UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index));
UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
}
else {
status = hmm_va_block_cpu_page_populate(va_block, page_index, src_page);
@@ -2588,8 +2671,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
// migrate_vma_setup() was able to isolate and lock the page;
// therefore, it is CPU resident and not mapped.
uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
uvm_page_mask_set(&va_block->cpu.resident, page_index);
uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(src_page), page_index);
}
// The call to migrate_vma_setup() will have inserted a migration
@@ -2604,7 +2686,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index));
hmm_va_block_cpu_page_unpopulate(va_block, page_index);
hmm_va_block_cpu_page_unpopulate(va_block, page_index, NULL);
}
}
@@ -2618,7 +2700,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
}
if (uvm_page_mask_empty(page_mask) ||
(service_context && service_context->block_context.hmm.swap_cached))
(service_context && service_context->block_context->hmm.swap_cached))
status = NV_WARN_MORE_PROCESSING_REQUIRED;
if (status != NV_OK)
@@ -2649,8 +2731,8 @@ static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma,
service_context = uvm_hmm_gpu_fault_event->service_context;
region = service_context->region;
prefetch_hint = &service_context->prefetch_hint;
src_pfns = service_context->block_context.hmm.src_pfns;
dst_pfns = service_context->block_context.hmm.dst_pfns;
src_pfns = service_context->block_context->hmm.src_pfns;
dst_pfns = service_context->block_context->hmm.dst_pfns;
// Build the migration mask.
// Note that thrashing pinned pages are already accounted for in
@@ -2708,8 +2790,8 @@ static NV_STATUS uvm_hmm_gpu_fault_finalize_and_map(uvm_hmm_gpu_fault_event_t *u
va_block = uvm_hmm_gpu_fault_event->va_block;
va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry;
service_context = uvm_hmm_gpu_fault_event->service_context;
src_pfns = service_context->block_context.hmm.src_pfns;
dst_pfns = service_context->block_context.hmm.dst_pfns;
src_pfns = service_context->block_context->hmm.src_pfns;
dst_pfns = service_context->block_context->hmm.dst_pfns;
region = service_context->region;
page_mask = &uvm_hmm_gpu_fault_event->page_mask;
@@ -2752,11 +2834,11 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
uvm_va_block_retry_t *va_block_retry,
uvm_service_block_context_t *service_context)
{
struct mm_struct *mm = service_context->block_context.mm;
struct vm_area_struct *vma = service_context->block_context.hmm.vma;
struct mm_struct *mm = service_context->block_context->mm;
struct vm_area_struct *vma = service_context->block_context->hmm.vma;
uvm_va_block_region_t region = service_context->region;
uvm_hmm_gpu_fault_event_t uvm_hmm_gpu_fault_event;
struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args;
struct migrate_vma *args = &service_context->block_context->hmm.migrate_vma_args;
int ret;
NV_STATUS status = NV_ERR_INVALID_ADDRESS;
@@ -2780,8 +2862,8 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
uvm_hmm_gpu_fault_event.service_context = service_context;
args->vma = vma;
args->src = service_context->block_context.hmm.src_pfns + region.first;
args->dst = service_context->block_context.hmm.dst_pfns + region.first;
args->src = service_context->block_context->hmm.src_pfns + region.first;
args->dst = service_context->block_context->hmm.dst_pfns + region.first;
args->start = uvm_va_block_region_start(va_block, region);
args->end = uvm_va_block_region_end(va_block, region) + 1;
args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM;
@@ -2815,8 +2897,8 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
// since migrate_vma_setup() would have reported that information.
// Try to make it resident in system memory and retry the migration.
status = hmm_make_resident_cpu(va_block,
service_context->block_context.hmm.vma,
service_context->block_context.hmm.src_pfns,
service_context->block_context->hmm.vma,
service_context->block_context->hmm.src_pfns,
region,
service_context->access_type,
NULL);
@@ -2962,16 +3044,6 @@ static NV_STATUS uvm_hmm_migrate_finalize(uvm_hmm_migrate_event_t *uvm_hmm_migra
&uvm_hmm_migrate_event->same_devmem_page_mask);
}
static bool is_resident(uvm_va_block_t *va_block,
uvm_processor_id_t dest_id,
uvm_va_block_region_t region)
{
if (!uvm_processor_mask_test(&va_block->resident, dest_id))
return false;
return uvm_page_mask_region_full(uvm_va_block_resident_mask_get(va_block, dest_id), region);
}
// Note that migrate_vma_*() doesn't handle asynchronous migrations so the
// migration flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP doesn't have an effect.
// TODO: Bug 3900785: investigate ways to implement async migration.
@@ -3063,9 +3135,7 @@ NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block,
uvm_page_mask_init_from_region(page_mask, region, NULL);
for_each_id_in_mask(id, &va_block->resident) {
if (!uvm_page_mask_andnot(page_mask,
page_mask,
uvm_va_block_resident_mask_get(va_block, id)))
if (!uvm_page_mask_andnot(page_mask, page_mask, uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE)))
return NV_OK;
}
@@ -3193,6 +3263,7 @@ static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
uvm_page_mask_t *page_mask = &uvm_hmm_migrate_event.page_mask;
const uvm_va_policy_t *policy;
uvm_va_policy_node_t *node;
uvm_page_mask_t *cpu_resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE);
unsigned long npages;
NV_STATUS status;
@@ -3215,7 +3286,7 @@ static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
// Pages resident on the GPU should not have a resident page in system
// memory.
// TODO: Bug 3660922: Need to handle read duplication at some point.
UVM_ASSERT(uvm_page_mask_region_empty(&va_block->cpu.resident, region));
UVM_ASSERT(uvm_page_mask_region_empty(cpu_resident_mask, region));
status = alloc_and_copy_to_cpu(va_block,
NULL,
@@ -3314,35 +3385,34 @@ NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block,
NULL);
}
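// Handle a CPU fault on a device-private page coming from a process other
// than the one that created the va_space which originally allocated the page
// (for example after fork()). There is no va_block to service such a fault,
// so the data is copied into a freshly allocated system memory page.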
NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
NV_STATUS uvm_hmm_remote_cpu_fault(struct vm_fault *vmf)
{
unsigned long src_pfn = 0;
unsigned long dst_pfn = 0;
struct page *dst_page;
NV_STATUS status = NV_OK;
unsigned long src_pfn;
unsigned long dst_pfn;
struct migrate_vma args;
struct page *src_page = vmf->page;
uvm_tracker_t tracker = UVM_TRACKER_INIT();
int ret;
ret = migrate_device_range(&src_pfn, pfn, 1);
if (ret)
return errno_to_nv_status(ret);
args.vma = vmf->vma;
args.src = &src_pfn;
args.dst = &dst_pfn;
args.start = nv_page_fault_va(vmf);
args.end = args.start + PAGE_SIZE;
args.pgmap_owner = &g_uvm_global;
args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
args.fault_page = src_page;
// We don't call migrate_vma_setup_locked() here because we don't
// have a va_block and don't want to ignore invalidations.
ret = migrate_vma_setup(&args);
UVM_ASSERT(!ret);
if (src_pfn & MIGRATE_PFN_MIGRATE) {
// All the code for copying a vidmem page to sysmem relies on
// having a va_block. However certain combinations of mremap()
// and fork() can result in device-private pages being mapped
// in a child process without a va_block.
//
// We don't expect the above to be a common occurrence so for
// now we allocate a fresh zero page when evicting without a
// va_block. However this results in child processes losing
// data so make sure we warn about it. Ideally we would just
// not migrate and SIGBUS the child if it tries to access the
// page. However that would prevent unloading of the driver so
// we're stuck with this until we fix the problem.
// TODO: Bug 3902536: add code to migrate GPU memory without having a
// va_block.
WARN_ON(1);
dst_page = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_ZERO);
struct page *dst_page;
dst_page = alloc_page(GFP_HIGHUSER_MOVABLE);
if (!dst_page) {
status = NV_ERR_NO_MEMORY;
goto out;
@@ -3351,11 +3421,15 @@ NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
lock_page(dst_page);
dst_pfn = migrate_pfn(page_to_pfn(dst_page));
migrate_device_pages(&src_pfn, &dst_pfn, 1);
status = uvm_hmm_copy_devmem_page(dst_page, src_page, &tracker);
if (status == NV_OK)
status = uvm_tracker_wait_deinit(&tracker);
}
migrate_vma_pages(&args);
out:
migrate_device_finalize(&src_pfn, &dst_pfn, 1);
migrate_vma_finalize(&args);
return status;
}
@@ -3606,4 +3680,3 @@ bool uvm_hmm_must_use_sysmem(uvm_va_block_t *va_block,
}
#endif // UVM_IS_CONFIG_HMM()

View File

@@ -307,10 +307,10 @@ typedef struct
uvm_migrate_mode_t mode,
uvm_tracker_t *out_tracker);
// Evicts all va_blocks in the va_space to the CPU. Unlike the
// other va_block eviction functions this is based on virtual
// address and therefore takes mmap_lock for read.
void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space);
// Handle a fault to a device-private page from a process other than the
// process which created the va_space that originally allocated the
// device-private page.
NV_STATUS uvm_hmm_remote_cpu_fault(struct vm_fault *vmf);
// This sets the va_block_context->hmm.src_pfns[] to the ZONE_DEVICE private
// PFN for the GPU chunk memory.
@@ -343,14 +343,6 @@ typedef struct
const uvm_page_mask_t *pages_to_evict,
uvm_va_block_region_t region);
// Migrate a GPU device-private page to system memory. This is
// called to remove CPU page table references to device private
// struct pages for the given GPU after all other references in
// va_blocks have been released and the GPU is in the process of
// being removed/torn down. Note that there is no mm, VMA,
// va_block or any user channel activity on this GPU.
NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn);
// This returns what would be the intersection of va_block start/end and
// VMA start/end-1 for the given 'lookup_address' if
// uvm_hmm_va_block_find_create() was called.
@@ -592,8 +584,10 @@ typedef struct
return NV_ERR_INVALID_ADDRESS;
}
static void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space)
static NV_STATUS uvm_hmm_remote_cpu_fault(struct vm_fault *vmf)
{
UVM_ASSERT(0);
return NV_ERR_INVALID_ADDRESS;
}
static NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block,
@@ -622,11 +616,6 @@ typedef struct
return NV_OK;
}
static NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
{
return NV_OK;
}
static NV_STATUS uvm_hmm_va_block_range_bounds(uvm_va_space_t *va_space,
struct mm_struct *mm,
NvU64 lookup_address,

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2020-2022 NVIDIA Corporation
Copyright (c) 2020-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -59,12 +59,12 @@ void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
// Physical CE writes to vidmem are non-coherent with respect to the CPU on
// GH180.
parent_gpu->ce_phys_vidmem_write_supported = !uvm_gpu_is_coherent(parent_gpu);
parent_gpu->ce_phys_vidmem_write_supported = !uvm_parent_gpu_is_coherent(parent_gpu);
// TODO: Bug 4174553: [HGX-SkinnyJoe][GH180] channel errors discussion/debug
// portion for the uvm tests became nonresponsive after
// some time and then failed even after reboot
parent_gpu->peer_copy_mode = uvm_gpu_is_coherent(parent_gpu) ?
parent_gpu->peer_copy_mode = uvm_parent_gpu_is_coherent(parent_gpu) ?
UVM_GPU_PEER_COPY_MODE_VIRTUAL : g_uvm_global.peer_copy_mode;
// All GR context buffers may be mapped to 57b wide VAs. All "compute" units

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2020-2023 NVIDIA Corporation
Copyright (c) 2020-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -368,10 +368,7 @@ static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_hopper(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
static void make_pde_hopper(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
{
NvU32 entry_count = entries_per_index_hopper(depth);
NvU64 *entry_bits = (NvU64 *)entry;

View File

@@ -128,8 +128,9 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
// present if we see the callback.
//
// The callback was added in commit 0f0a327fa12cd55de5e7f8c05a70ac3d047f405e,
// v3.19 (2014-11-13).
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE)
// v3.19 (2014-11-13) and renamed in commit 1af5a8109904.
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE) || \
defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
#define UVM_CAN_USE_MMU_NOTIFIERS() 1
#else
#define UVM_CAN_USE_MMU_NOTIFIERS() 0
@@ -348,6 +349,47 @@ static inline NvU64 NV_GETTIME(void)
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))
#endif
#if !defined(NV_FIND_NEXT_BIT_WRAP_PRESENT)
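// Fallback for kernels that do not provide find_next_bit_wrap(): returns the
// first set bit at or after 'offset', wrapping around to the start of the
// bitmap, or 'size' if no bit is set.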
static inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset)
{
unsigned long bit = find_next_bit(addr, size, offset);
if (bit < size)
return bit;
bit = find_first_bit(addr, offset);
return bit < offset ? bit : size;
}
#endif
// for_each_set_bit_wrap and __for_each_wrap were introduced in v6.1-rc1
// by commit 4fe49b3b97c2640147c46519c2a6fdb06df34f5f
#if !defined(for_each_set_bit_wrap)
static inline unsigned long __for_each_wrap(const unsigned long *bitmap,
unsigned long size,
unsigned long start,
unsigned long n)
{
unsigned long bit;
if (n > start) {
bit = find_next_bit(bitmap, size, n);
if (bit < size)
return bit;
n = 0;
}
bit = find_next_bit(bitmap, start, n);
return bit < start ? bit : size;
}
#define for_each_set_bit_wrap(bit, addr, size, start) \
for ((bit) = find_next_bit_wrap((addr), (size), (start)); \
(bit) < (size); \
(bit) = __for_each_wrap((addr), (size), (start), (bit) + 1))
#endif
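// A minimal usage sketch of the two wrap-around helpers above. The function
// below is illustrative only and is not part of the driver; it simply counts
// the set bits visited by for_each_set_bit_wrap() starting at 'start'.
static inline unsigned int uvm_example_count_set_bits_wrap(const unsigned long *bitmap,
                                                           unsigned long size,
                                                           unsigned long start)
{
    unsigned long bit;
    unsigned int count = 0;

    // Each set bit is visited exactly once, beginning with the first set bit
    // at or after 'start' and wrapping around to the start of the bitmap.
    for_each_set_bit_wrap(bit, bitmap, size, start)
        count++;

    return count;
}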
// Added in 2.6.24
#ifndef ACCESS_ONCE
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
@@ -579,4 +621,5 @@ static inline pgprot_t uvm_pgprot_decrypted(pgprot_t prot)
#include <asm/page.h>
#define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x)))
#endif
#endif // _UVM_LINUX_H

View File

@@ -355,6 +355,7 @@ static uvm_membar_t va_range_downgrade_membar(uvm_va_range_t *va_range, uvm_ext_
if (!ext_gpu_map->mem_handle)
return UVM_MEMBAR_GPU;
// EGM uses the same barriers as sysmem.
return uvm_hal_downgrade_membar_type(ext_gpu_map->gpu,
!ext_gpu_map->is_sysmem && ext_gpu_map->gpu == ext_gpu_map->owning_gpu);
}
@@ -633,6 +634,8 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
const UvmGpuMemoryInfo *mem_info)
{
uvm_gpu_t *owning_gpu;
if (mem_info->egm)
UVM_ASSERT(mem_info->sysmem);
if (!mem_info->deviceDescendant && !mem_info->sysmem) {
ext_gpu_map->owning_gpu = NULL;
@@ -641,6 +644,7 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
}
// This is a local or peer allocation, so the owning GPU must have been
// registered.
// This also checks whether the EGM owning GPU is registered.
owning_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &mem_info->uuid);
if (!owning_gpu)
return NV_ERR_INVALID_DEVICE;
@@ -651,13 +655,10 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
// crashes when it's eventually freed.
// TODO: Bug 1811006: Bug tracking the RM issue, its fix might change the
// semantics of sysmem allocations.
if (mem_info->sysmem) {
ext_gpu_map->owning_gpu = owning_gpu;
ext_gpu_map->is_sysmem = true;
return NV_OK;
}
if (owning_gpu != mapping_gpu) {
// Check if peer access for peer memory is enabled.
// This path also handles EGM allocations.
if (owning_gpu != mapping_gpu && (!mem_info->sysmem || mem_info->egm)) {
// TODO: Bug 1757136: In SLI, the returned UUID may be different but a
// local mapping must be used. We need to query SLI groups to know
// that.
@@ -666,7 +667,9 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
}
ext_gpu_map->owning_gpu = owning_gpu;
ext_gpu_map->is_sysmem = false;
ext_gpu_map->is_sysmem = mem_info->sysmem;
ext_gpu_map->is_egm = mem_info->egm;
return NV_OK;
}
@@ -719,6 +722,7 @@ static NV_STATUS uvm_ext_gpu_map_split(uvm_range_tree_t *tree,
new->gpu = existing_map->gpu;
new->owning_gpu = existing_map->owning_gpu;
new->is_sysmem = existing_map->is_sysmem;
new->is_egm = existing_map->is_egm;
// Initialize the new ext_gpu_map tracker as a copy of the existing_map tracker.
// This way, any operations on any of the two ext_gpu_maps will be able to

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -106,10 +106,7 @@ static NvU64 small_half_pde_maxwell(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_maxwell(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
static void make_pde_maxwell(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
{
NvU64 pde_bits = 0;
UVM_ASSERT(depth == 0);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -93,9 +93,8 @@ static bool sysmem_can_be_mapped_on_gpu(uvm_mem_t *sysmem)
{
UVM_ASSERT(uvm_mem_is_sysmem(sysmem));
// In Confidential Computing, only unprotected memory can be mapped on the
// GPU
if (g_uvm_global.conf_computing_enabled)
// If SEV is enabled, only unprotected memory can be mapped
if (g_uvm_global.sev_enabled)
return uvm_mem_is_sysmem_dma(sysmem);
return true;
@@ -738,7 +737,7 @@ static NV_STATUS mem_map_cpu_to_sysmem_kernel(uvm_mem_t *mem)
pages[page_index] = mem_cpu_page(mem, page_index * PAGE_SIZE);
}
if (g_uvm_global.conf_computing_enabled && uvm_mem_is_sysmem_dma(mem))
if (g_uvm_global.sev_enabled && uvm_mem_is_sysmem_dma(mem))
prot = uvm_pgprot_decrypted(PAGE_KERNEL_NOENC);
mem->kernel.cpu_addr = vmap(pages, num_pages, VM_MAP, prot);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -44,10 +44,10 @@ static NvU32 first_page_size(NvU32 page_sizes)
static inline NV_STATUS __alloc_map_sysmem(NvU64 size, uvm_gpu_t *gpu, uvm_mem_t **sys_mem)
{
if (g_uvm_global.conf_computing_enabled)
if (g_uvm_global.sev_enabled)
return uvm_mem_alloc_sysmem_dma_and_map_cpu_kernel(size, gpu, current->mm, sys_mem);
return uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, sys_mem);
else
return uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, sys_mem);
}
static NV_STATUS check_accessible_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem)
@@ -335,6 +335,9 @@ error:
static bool should_test_page_size(size_t alloc_size, NvU32 page_size)
{
if (g_uvm_global.sev_enabled)
return false;
if (g_uvm_global.num_simulated_devices == 0)
return true;

View File

@@ -130,9 +130,9 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
NV_STATUS status = NV_OK;
NV_STATUS tracker_status;
// Save the mask of unmapped pages because it will change after the
// Get the mask of unmapped pages because it will change after the
// first map operation
uvm_page_mask_complement(&va_block_context->caller_page_mask, &va_block->maybe_mapped_pages);
uvm_va_block_unmapped_pages_get(va_block, region, &va_block_context->caller_page_mask);
if (uvm_va_block_is_hmm(va_block) && !UVM_ID_IS_CPU(dest_id)) {
// Do not map pages that are already resident on the CPU. This is in
@@ -147,7 +147,7 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
// such pages at all, when migrating.
uvm_page_mask_andnot(&va_block_context->caller_page_mask,
&va_block_context->caller_page_mask,
uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU));
uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE));
}
// Only map those pages that are not mapped anywhere else (likely due
@@ -377,7 +377,7 @@ static bool va_block_should_do_cpu_preunmap(uvm_va_block_t *va_block,
mapped_pages_cpu = uvm_va_block_map_mask_get(va_block, UVM_ID_CPU);
if (uvm_processor_mask_test(&va_block->resident, dest_id)) {
const uvm_page_mask_t *resident_pages_dest = uvm_va_block_resident_mask_get(va_block, dest_id);
const uvm_page_mask_t *resident_pages_dest = uvm_va_block_resident_mask_get(va_block, dest_id, NUMA_NO_NODE);
uvm_page_mask_t *do_not_unmap_pages = &va_block_context->scratch_page_mask;
// TODO: Bug 1877578

View File

@@ -672,14 +672,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
.finalize_and_map = uvm_migrate_vma_finalize_and_map_helper,
};
// WAR for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
//
// This code path isn't used on GH180 but we need to maintain consistent
// behaviour on systems that do use it.
if (!vma_is_anonymous(args->vma))
return NV_WARN_NOTHING_TO_DO;
ret = migrate_vma(&uvm_migrate_vma_ops, args->vma, args->start, args->end, args->src, args->dst, state);
if (ret < 0)
return errno_to_nv_status(ret);
@@ -693,24 +685,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
if (ret < 0)
return errno_to_nv_status(ret);
// TODO: Bug 2419180: support file-backed pages in migrate_vma, when
// support for it is added to the Linux kernel
//
// A side-effect of migrate_vma_setup() is that it calls MMU notifiers even if
// a page can't be migrated (e.g. because it's a non-anonymous mapping). We
// need this side-effect for SMMU on GH180 to ensure any cached read-only
// entries are flushed from SMMU on permission upgrade.
//
// TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
//
// The above WAR doesn't work for HugeTLBfs mappings because
// migrate_vma_setup() will fail in that case.
if (!vma_is_anonymous(args->vma)) {
migrate_vma_finalize(args);
return NV_WARN_NOTHING_TO_DO;
}
uvm_migrate_vma_alloc_and_copy(args, state);
if (state->status == NV_OK) {
migrate_vma_pages(args);
@@ -884,13 +858,9 @@ static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
start = max(start, vma->vm_start);
outer = min(outer, vma->vm_end);
// migrate_vma only supports anonymous VMAs. We check for those after
// calling migrate_vma_setup() to work around Bug 4130089. We need to check
// for HugeTLB VMAs here because migrate_vma_setup() will return a fatal
// error for those.
// TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
if (is_vm_hugetlb_page(vma))
// TODO: Bug 2419180: support file-backed pages in migrate_vma, when
// support for it is added to the Linux kernel
if (!vma_is_anonymous(vma))
return NV_WARN_NOTHING_TO_DO;
if (uvm_processor_mask_empty(&va_space->registered_gpus))

View File

@@ -51,7 +51,7 @@ typedef struct
#if defined(CONFIG_MIGRATE_VMA_HELPER)
#define UVM_MIGRATE_VMA_SUPPORTED 1
#else
#if NV_IS_EXPORT_SYMBOL_PRESENT_migrate_vma_setup
#if defined(CONFIG_DEVICE_PRIVATE) && defined(NV_MIGRATE_VMA_SETUP_PRESENT)
#define UVM_MIGRATE_VMA_SUPPORTED 1
#endif
#endif

View File

@@ -323,153 +323,37 @@ static void uvm_mmu_page_table_cpu_memset_16(uvm_gpu_t *gpu,
uvm_mmu_page_table_cpu_unmap(gpu, phys_alloc);
}
static void pde_fill_cpu(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr)
{
NvU64 pde_data[2], entry_size;
NvU32 i;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(directory->depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
for (i = 0; i < pde_count; i++) {
tree->hal->make_pde(pde_data, phys_addr, directory->depth, directory->entries[start_index + i]);
if (entry_size == sizeof(pde_data[0]))
uvm_mmu_page_table_cpu_memset_8(tree->gpu, &directory->phys_alloc, start_index + i, pde_data[0], 1);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, &directory->phys_alloc, start_index + i, pde_data, 1);
}
}
static void pde_fill_gpu(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->phys_alloc.addr);
NvU32 max_inline_entries;
uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
uvm_gpu_address_t inline_data_addr;
uvm_push_inline_data_t inline_data;
NvU32 entry_count, i, j;
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(directory->depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / entry_size;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
pde_entry_addr.address += start_index * entry_size;
for (i = 0; i < pde_count;) {
// All but the first memory operation can be pipelined. We respect the
// caller's pipelining settings for the first push.
if (i != 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
entry_count = min(pde_count - i, max_inline_entries);
// No membar is needed until the last memory operation. Otherwise,
// use caller's membar flag.
if ((i + entry_count) < pde_count)
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
uvm_push_set_flag(push, push_membar_flag);
uvm_push_inline_data_begin(push, &inline_data);
for (j = 0; j < entry_count; j++) {
tree->hal->make_pde(pde_data, phys_addr, directory->depth, directory->entries[start_index + i + j]);
uvm_push_inline_data_add(&inline_data, pde_data, entry_size);
}
inline_data_addr = uvm_push_inline_data_end(&inline_data);
tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * entry_size);
i += entry_count;
pde_entry_addr.address += entry_size * entry_count;
}
}
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
// pde_fill() is optimized for pde_count == 1, which is the common case. The
// map_remap() function is the only caller with pde_count > 1, and it is used
// only on GA100 GPUs for 512MB page size mappings.
static void pde_fill(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, directory->depth, UVM_PAGE_SIZE_AGNOSTIC));
if (push)
pde_fill_gpu(tree, directory, start_index, pde_count, phys_addr, push);
else
pde_fill_cpu(tree, directory, start_index, pde_count, phys_addr);
}
static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
{
NvU32 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);
NvU64 clear_bits[2];
uvm_mmu_mode_hal_t *hal = tree->hal;
// Passing in NULL for the phys_allocs will mark the child entries as
// invalid.
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
// Init with an invalid PTE or clean PDE. Only Maxwell PDEs can have more
// than 512 entries. We initialize them all with the same clean PDE.
// Additionally, only ATS systems may require clean PDE bit settings based
// on the mapping VA.
if (dir->depth == tree->hal->page_table_depth(page_size) || (entries_count > 512 && !g_uvm_global.ats.enabled)) {
NvU64 clear_bits[2];
// If it is not a PTE, make a clean PDE.
if (dir->depth != tree->hal->page_table_depth(page_size)) {
tree->hal->make_pde(clear_bits, phys_allocs, dir->depth, dir->entries[0]);
// Make sure that using only clear_bits[0] will work.
UVM_ASSERT(tree->hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
}
else {
*clear_bits = 0;
}
// Initialize the memory to a reasonable value.
if (push) {
tree->gpu->parent->ce_hal->memset_8(push,
uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
*clear_bits,
dir->phys_alloc.size);
}
else {
uvm_mmu_page_table_cpu_memset_8(tree->gpu,
&dir->phys_alloc,
0,
*clear_bits,
dir->phys_alloc.size / sizeof(*clear_bits));
}
if (dir->depth == tree->hal->page_table_depth(page_size)) {
*clear_bits = 0; // Invalid PTE
}
else {
pde_fill(tree, dir, 0, entries_count, phys_allocs, push);
// passing in NULL for the phys_allocs will mark the child entries as invalid
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
hal->make_pde(clear_bits, phys_allocs, dir->depth);
// Make sure that using only clear_bits[0] will work
UVM_ASSERT(hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
}
// initialize the memory to a reasonable value
if (push) {
tree->gpu->parent->ce_hal->memset_8(push,
uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
*clear_bits,
dir->phys_alloc.size);
}
else {
uvm_mmu_page_table_cpu_memset_8(tree->gpu,
&dir->phys_alloc,
0,
*clear_bits,
dir->phys_alloc.size / sizeof(*clear_bits));
}
}
static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
@@ -483,10 +367,8 @@ static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
NvLength phys_alloc_size = hal->allocation_size(depth, page_size);
uvm_page_directory_t *dir;
// The page tree doesn't cache PTEs so space is not allocated for entries
// that are always PTEs.
// 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not
// page_size.
// The page tree doesn't cache PTEs so space is not allocated for entries that are always PTEs.
// 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not page_size.
if (depth == hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC))
entry_count = 0;
else
@@ -527,6 +409,108 @@ static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, N
return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
}
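// Write pde_count identical PDE entries, starting at start_index, through the
// CPU mapping of the directory allocation.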
static void pde_fill_cpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr)
{
NvU64 pde_data[2], entry_size;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
if (entry_size == sizeof(pde_data[0]))
uvm_mmu_page_table_cpu_memset_8(tree->gpu, directory, start_index, pde_data[0], pde_count);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, directory, start_index, pde_data, pde_count);
}
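// Same as pde_fill_cpu(), but the entries are written with the GPU's Copy
// Engine: 8-byte entries use a memset, while 16-byte entries are staged as
// inline push data and copied into the directory.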
static void pde_fill_gpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->addr);
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
pde_entry_addr.address += start_index * entry_size;
if (entry_size == sizeof(pde_data[0])) {
tree->gpu->parent->ce_hal->memset_8(push, pde_entry_addr, pde_data[0], sizeof(pde_data[0]) * pde_count);
}
else {
NvU32 max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / sizeof(pde_data);
uvm_gpu_address_t inline_data_addr;
uvm_push_inline_data_t inline_data;
uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
NvU32 i;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
for (i = 0; i < pde_count;) {
NvU32 j;
NvU32 entry_count = min(pde_count - i, max_inline_entries);
uvm_push_inline_data_begin(push, &inline_data);
for (j = 0; j < entry_count; j++)
uvm_push_inline_data_add(&inline_data, pde_data, sizeof(pde_data));
inline_data_addr = uvm_push_inline_data_end(&inline_data);
// All but the first memcopy can be pipelined. We respect the
// caller's pipelining settings for the first push.
if (i != 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
// No membar is needed until the last copy. Otherwise, use
// caller's membar flag.
if (i + entry_count < pde_count)
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
uvm_push_set_flag(push, push_membar_flag);
tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * sizeof(pde_data));
i += entry_count;
pde_entry_addr.address += sizeof(pde_data) * entry_count;
}
}
}
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
static void pde_fill(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
if (push)
pde_fill_gpu(tree, depth, directory, start_index, pde_count, phys_addr, push);
else
pde_fill_cpu(tree, depth, directory, start_index, pde_count, phys_addr);
}
static uvm_page_directory_t *host_pde_write(uvm_page_directory_t *dir,
uvm_page_directory_t *parent,
NvU32 index_in_parent)
@@ -556,7 +540,7 @@ static void pde_write(uvm_page_tree_t *tree,
phys_allocs[i] = &entry->phys_alloc;
}
pde_fill(tree, dir, entry_index, 1, phys_allocs, push);
pde_fill(tree, dir->depth, &dir->phys_alloc, entry_index, 1, phys_allocs, push);
}
static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size)
@@ -829,11 +813,8 @@ static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm
static void map_remap_deinit(uvm_page_tree_t *tree)
{
if (tree->map_remap.pde0) {
phys_mem_deallocate(tree, &tree->map_remap.pde0->phys_alloc);
uvm_kvfree(tree->map_remap.pde0);
tree->map_remap.pde0 = NULL;
}
if (tree->map_remap.pde0.size)
phys_mem_deallocate(tree, &tree->map_remap.pde0);
if (tree->map_remap.ptes_invalid_4k.size)
phys_mem_deallocate(tree, &tree->map_remap.ptes_invalid_4k);
@@ -858,16 +839,10 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
// PDE1-depth(512M) PTE. We first map it to the pde0 directory, then we
// return the PTE to get_ptes()'s caller.
if (tree->hal->page_sizes() & UVM_PAGE_SIZE_512M) {
tree->map_remap.pde0 = allocate_directory(tree,
UVM_PAGE_SIZE_2M,
tree->hal->page_table_depth(UVM_PAGE_SIZE_2M),
UVM_PMM_ALLOC_FLAGS_EVICT);
if (tree->map_remap.pde0 == NULL) {
status = NV_ERR_NO_MEMORY;
status = allocate_page_table(tree, UVM_PAGE_SIZE_2M, &tree->map_remap.pde0);
if (status != NV_OK)
goto error;
}
}
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "map remap init");
if (status != NV_OK)
goto error;
@@ -889,23 +864,22 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
NvU32 depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_4K) - 1;
size_t index_4k = tree->hal->entry_offset(depth, UVM_PAGE_SIZE_4K);
NvU32 pde0_entries = tree->map_remap.pde0->phys_alloc.size / tree->hal->entry_size(tree->map_remap.pde0->depth);
// pde0 depth equals UVM_PAGE_SIZE_2M.
NvU32 pde0_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_2M);
NvU32 pde0_entries = tree->map_remap.pde0.size / tree->hal->entry_size(pde0_depth);
// The big-page entry is NULL which makes it an invalid entry.
phys_allocs[index_4k] = &tree->map_remap.ptes_invalid_4k;
// By default CE operations include a MEMBAR_SYS. MEMBAR_GPU is
// sufficient when pde0 is allocated in VIDMEM.
if (tree->map_remap.pde0->phys_alloc.addr.aperture == UVM_APERTURE_VID)
if (tree->map_remap.pde0.addr.aperture == UVM_APERTURE_VID)
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
// This is an orphan directory; make_pde() requires a directory to
// compute the VA. The UVM depth map_remap() operates at is not in the
// range make_pde() must operate on. We only need to supply the fields
// used by make_pde() so it does not access invalid memory addresses.
pde_fill(tree,
tree->map_remap.pde0,
pde0_depth,
&tree->map_remap.pde0,
0,
pde0_entries,
(uvm_mmu_page_table_alloc_t **)&phys_allocs,
@@ -932,10 +906,11 @@ error:
// --------------|-------------------------||----------------|----------------
// vidmem | - || vidmem | false
// sysmem | - || sysmem | false
// default | <not set> || vidmem | true
// default | <not set> || vidmem | true (1)
// default | vidmem || vidmem | false
// default | sysmem || sysmem | false
//
// (1) When SEV mode is enabled, the fallback path is disabled.
//
// In SR-IOV heavy the page tree must be in vidmem, to prevent guest drivers
// from updating GPU page tables without hypervisor knowledge.
@@ -951,27 +926,28 @@ error:
//
static void page_tree_set_location(uvm_page_tree_t *tree, uvm_aperture_t location)
{
bool should_location_be_vidmem;
UVM_ASSERT(tree->gpu != NULL);
UVM_ASSERT_MSG((location == UVM_APERTURE_VID) ||
(location == UVM_APERTURE_SYS) ||
(location == UVM_APERTURE_DEFAULT),
"Invalid location %s (%d)\n", uvm_aperture_string(location), (int)location);
// The page tree of a "fake" GPU used during page tree testing can be in
// sysmem in scenarios where a "real" GPU must be in vidmem. Fake GPUs can
// be identified by having no channel manager.
if (tree->gpu->channel_manager != NULL) {
should_location_be_vidmem = uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu)
|| uvm_conf_computing_mode_enabled(tree->gpu);
if (uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu))
UVM_ASSERT(location == UVM_APERTURE_VID);
else if (uvm_conf_computing_mode_enabled(tree->gpu))
UVM_ASSERT(location == UVM_APERTURE_VID);
}
// The page tree of a "fake" GPU used during page tree testing can be in
// sysmem even if should_location_be_vidmem is true. A fake GPU can be
// identified by having no channel manager.
if ((tree->gpu->channel_manager != NULL) && should_location_be_vidmem)
UVM_ASSERT(location == UVM_APERTURE_VID);
if (location == UVM_APERTURE_DEFAULT) {
if (page_table_aperture == UVM_APERTURE_DEFAULT) {
tree->location = UVM_APERTURE_VID;
tree->location_sys_fallback = true;
// See the comment (1) above.
tree->location_sys_fallback = !g_uvm_global.sev_enabled;
}
else {
tree->location = page_table_aperture;
@@ -1358,9 +1334,10 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
if (uvm_page_table_range_aperture(range) == UVM_APERTURE_VID)
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
phys_alloc[0] = &tree->map_remap.pde0->phys_alloc;
phys_alloc[0] = &tree->map_remap.pde0;
pde_fill(tree,
range->table,
range->table->depth,
&range->table->phys_alloc,
range->start_index,
range->entry_count,
(uvm_mmu_page_table_alloc_t **)&phys_alloc,

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -219,7 +219,7 @@ struct uvm_mmu_mode_hal_struct
// point to two items for dual PDEs).
// Any of the allocs may be NULL, in which case they are treated as empty.
void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, NvU32 depth, uvm_page_directory_t *child_dir);
void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, NvU32 depth);
// size of an entry in a directory/table. Generally either 8 or 16 bytes.
// (in the case of Pascal dual PDEs)
@@ -229,7 +229,7 @@ struct uvm_mmu_mode_hal_struct
NvU32 (*entries_per_index)(NvU32 depth);
// For dual PDEs, this is either 1 or 0, depending on the page size.
// This is used to index the host copy only. GPU PDEs are always entirely
// This is used to index the host copy only. GPU PDEs are always entirely
// re-written using make_pde.
NvLength (*entry_offset)(NvU32 depth, NvU32 page_size);
@@ -295,8 +295,9 @@ struct uvm_page_tree_struct
// PDE0 where all big-page entries are invalid, and small-page entries
// point to ptes_invalid_4k.
// pde0 is used on Pascal+ GPUs, i.e., they have the same PDE format.
uvm_page_directory_t *pde0;
// pde0 is only used on Pascal-Ampere, i.e., they have the same PDE
// format.
uvm_mmu_page_table_alloc_t pde0;
} map_remap;
// Tracker for all GPU operations on the tree
@@ -364,32 +365,21 @@ void uvm_page_tree_deinit(uvm_page_tree_t *tree);
// the same page size without an intervening put_ptes. To duplicate a subset of
// an existing range or change the size of an existing range, use
// uvm_page_table_range_get_upper() and/or uvm_page_table_range_shrink().
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range);
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
// Same as uvm_page_tree_get_ptes(), but doesn't synchronize the GPU work.
//
// All pending operations can be waited on with uvm_page_tree_wait().
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range);
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
// Returns a single-entry page table range for the addresses passed.
// The size parameter must be a page size supported by this tree.
// This is equivalent to calling uvm_page_tree_get_ptes() with size equal to
// page_size.
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single);
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *single);
// For a single-entry page table range, write the PDE (which could be a dual
// PDE) to the GPU.
@@ -488,8 +478,8 @@ NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
// new_range_vec will contain the upper portion of range_vec, starting at
// new_end + 1.
//
// new_end + 1 is required to be within the address range of range_vec and be
// aligned to range_vec's page_size.
// new_end + 1 is required to be within the address range of range_vec and be aligned to
// range_vec's page_size.
//
// On failure, the original range vector is left unmodified.
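// For example (illustrative numbers, not taken from a caller): splitting a
// range_vec covering [0x0, 0x200000) with a 64K page size at new_end =
// 0xFFFFF leaves [0x0, 0x100000) in range_vec and moves [0x100000, 0x200000)
// into new_range_vec, since new_end + 1 = 0x100000 is 64K-aligned and lies
// inside the original range.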
NV_STATUS uvm_page_table_range_vec_split_upper(uvm_page_table_range_vec_t *range_vec,
@@ -511,22 +501,18 @@ void uvm_page_table_range_vec_destroy(uvm_page_table_range_vec_t *range_vec);
// for each offset.
// The caller_data pointer is what the caller passed in as caller_data to
// uvm_page_table_range_vec_write_ptes().
typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec,
NvU64 offset,
void *caller_data);
typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec, NvU64 offset,
void *caller_data);
// Write all PTEs covered by the range vector using the given PTE making
// function.
// Write all PTEs covered by the range vector using the given PTE making function.
//
// After writing all the PTEs a TLB invalidate operation is performed including
// the passed in tlb_membar.
//
// See comments about uvm_page_table_range_pte_maker_t for details about the
// PTE making callback.
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker,
void *caller_data);
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker, void *caller_data);
// Set all PTEs covered by the range vector to an empty PTE
//
@@ -650,9 +636,8 @@ static NvU64 uvm_page_table_range_size(uvm_page_table_range_t *range)
// Get the physical address of the entry at entry_index within the range
// (counted from range->start_index).
static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree,
uvm_page_table_range_t *range,
size_t entry_index)
static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree, uvm_page_table_range_t *range,
size_t entry_index)
{
NvU32 entry_size = uvm_mmu_pte_size(tree, range->page_size);
uvm_gpu_phys_address_t entry = range->table->phys_alloc.addr;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -146,15 +146,9 @@ static void fake_tlb_invals_disable(void)
g_fake_tlb_invals_tracking_enabled = false;
}
// Fake TLB invalidate VA that just saves off the parameters so that they can be
// verified later.
static void fake_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
uvm_membar_t membar)
// Fake TLB invalidate VA that just saves off the parameters so that they can be verified later
static void fake_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_address_t pdb,
NvU32 depth, NvU64 base, NvU64 size, NvU32 page_size, uvm_membar_t membar)
{
if (!g_fake_tlb_invals_tracking_enabled)
return;
@@ -216,8 +210,8 @@ static bool assert_and_reset_last_invalidate(NvU32 expected_depth, bool expected
}
if ((g_last_fake_inval->membar == UVM_MEMBAR_NONE) == expected_membar) {
UVM_TEST_PRINT("Expected %s membar, got %s instead\n",
expected_membar ? "a" : "no",
uvm_membar_string(g_last_fake_inval->membar));
expected_membar ? "a" : "no",
uvm_membar_string(g_last_fake_inval->membar));
result = false;
}
@@ -236,8 +230,7 @@ static bool assert_last_invalidate_all(NvU32 expected_depth, bool expected_memba
}
if (g_last_fake_inval->base != 0 || g_last_fake_inval->size != -1) {
UVM_TEST_PRINT("Expected invalidate all but got range [0x%llx, 0x%llx) instead\n",
g_last_fake_inval->base,
g_last_fake_inval->base + g_last_fake_inval->size);
g_last_fake_inval->base, g_last_fake_inval->base + g_last_fake_inval->size);
return false;
}
if (g_last_fake_inval->depth != expected_depth) {
@@ -254,16 +247,15 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);
if (g_fake_invals_count == 0) {
UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n", base, base + size);
UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n",
base, base + size);
return false;
}
if ((inval->base != base || inval->size != size) && inval->base != 0 && inval->size != -1) {
UVM_TEST_PRINT("Expected invalidate range [0x%llx, 0x%llx), but got range [0x%llx, 0x%llx) instead\n",
base,
base + size,
inval->base,
inval->base + inval->size);
base, base + size,
inval->base, inval->base + inval->size);
return false;
}
if (inval->depth != expected_depth) {
@@ -278,13 +270,7 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
return true;
}
static bool assert_invalidate_range(NvU64 base,
NvU64 size,
NvU32 page_size,
bool allow_inval_all,
NvU32 range_depth,
NvU32 all_depth,
bool expected_membar)
static bool assert_invalidate_range(NvU64 base, NvU64 size, NvU32 page_size, bool allow_inval_all, NvU32 range_depth, NvU32 all_depth, bool expected_membar)
{
NvU32 i;
@@ -502,6 +488,7 @@ static NV_STATUS alloc_adjacent_pde_64k_memory(uvm_gpu_t *gpu)
return NV_OK;
}
static NV_STATUS alloc_nearby_pde_64k_memory(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
@@ -855,7 +842,6 @@ static NV_STATUS get_two_free_apart(uvm_gpu_t *gpu)
TEST_CHECK_RET(range2.entry_count == 256);
TEST_CHECK_RET(range2.table->ref_count == 512);
TEST_CHECK_RET(range1.table == range2.table);
// 4k page is second entry in a dual PDE
TEST_CHECK_RET(range1.table == tree.root->entries[0]->entries[0]->entries[0]->entries[1]);
TEST_CHECK_RET(range1.start_index == 256);
@@ -885,7 +871,6 @@ static NV_STATUS get_overlapping_dual_pdes(uvm_gpu_t *gpu)
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, size, size, &range64k), NV_OK);
TEST_CHECK_RET(range64k.entry_count == 16);
TEST_CHECK_RET(range64k.table->ref_count == 16);
// 4k page is second entry in a dual PDE
TEST_CHECK_RET(range64k.table == tree.root->entries[0]->entries[0]->entries[0]->entries[0]);
TEST_CHECK_RET(range64k.start_index == 16);
@@ -1045,13 +1030,10 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
// Depth 4
NvU64 extent_pte = UVM_PAGE_SIZE_2M;
// Depth 3
NvU64 extent_pde0 = extent_pte * (1ull << 8);
// Depth 2
NvU64 extent_pde1 = extent_pde0 * (1ull << 9);
// Depth 1
NvU64 extent_pde2 = extent_pde1 * (1ull << 9);
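// Worked out, assuming UVM_PAGE_SIZE_2M is 2MB as the name suggests:
// extent_pde0 = 2MB * 256 = 512MB, extent_pde1 = 512MB * 512 = 256GB, and
// extent_pde2 = 256GB * 512 = 128TB.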
@@ -1099,11 +1081,7 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
return status;
}
static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
NvU64 base,
NvU64 size,
NvU32 min_page_size,
NvU32 max_page_size)
static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree, NvU64 base, NvU64 size, NvU32 min_page_size, NvU32 max_page_size)
{
NV_STATUS status = NV_OK;
uvm_push_t push;
@@ -1227,11 +1205,7 @@ static bool assert_range_vec_ptes(uvm_page_table_range_vec_t *range_vec, bool ex
NvU64 expected_pte = expecting_cleared ? 0 : range_vec->size + offset;
if (*pte != expected_pte) {
UVM_TEST_PRINT("PTE is 0x%llx instead of 0x%llx for offset 0x%llx within range [0x%llx, 0x%llx)\n",
*pte,
expected_pte,
offset,
range_vec->start,
range_vec->size);
*pte, expected_pte, offset, range_vec->start, range_vec->size);
return false;
}
offset += range_vec->page_size;
@@ -1252,11 +1226,7 @@ static NV_STATUS test_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec
TEST_CHECK_RET(data.status == NV_OK);
TEST_CHECK_RET(data.count == range_vec->size / range_vec->page_size);
TEST_CHECK_RET(assert_invalidate_range_specific(g_last_fake_inval,
range_vec->start,
range_vec->size,
range_vec->page_size,
page_table_depth,
membar != UVM_MEMBAR_NONE));
range_vec->start, range_vec->size, range_vec->page_size, page_table_depth, membar != UVM_MEMBAR_NONE));
TEST_CHECK_RET(assert_range_vec_ptes(range_vec, false));
fake_tlb_invals_disable();
@@ -1279,11 +1249,7 @@ static NV_STATUS test_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec
return NV_OK;
}
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
uvm_page_table_range_vec_t **range_vec_out)
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree, NvU64 start, NvU64 size, NvU32 page_size, uvm_page_table_range_vec_t **range_vec_out)
{
uvm_page_table_range_vec_t *range_vec;
uvm_pmm_alloc_flags_t pmm_flags = UVM_PMM_ALLOC_FLAGS_EVICT;
@@ -1586,17 +1552,17 @@ static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
memset(phys_allocs, 0, sizeof(phys_allocs));
hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
hal->make_pde(&pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits == 0x0L);
phys_allocs[0] = &alloc_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
hal->make_pde(&pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits == 0x1BBBBBBD99999992LL);
phys_allocs[0] = &alloc_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
hal->make_pde(&pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits == 0x9999999E1BBBBBB1LL);
for (j = 0; j <= 2; j++) {
@@ -1666,7 +1632,6 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
// big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBBB00LL);
@@ -1674,31 +1639,31 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
// uncached, i.e., the sysmem data is not cached in GPU's L2 cache. Clear
@@ -1762,36 +1727,36 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
// NO_ATS PDE1 (depth 2)
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 2, NULL);
hal->make_pde(pde_bits, phys_allocs, 2);
if (g_uvm_global.ats.enabled)
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB2A);
else
@@ -1840,32 +1805,32 @@ static NV_STATUS entry_test_hopper(uvm_gpu_t *gpu, entry_test_page_size_func ent
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
// Cleared PDEs work as expected for big and small PDEs.
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 4, NULL);
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs, uncached ATS allowed.
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x999999999900C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBB00A);
// Dual PDEs, uncached.
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 4, NULL);
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 4, NULL);
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C);
// uncached, i.e., the sysmem data is not cached in GPU's L2 cache, and
@@ -2338,8 +2303,7 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
gpu->parent = parent_gpu;
// At least test_tlb_invalidates() relies on global state
// (g_tlb_invalidate_*) so make sure only one test instance can run at a
// time.
// (g_tlb_invalidate_*) so make sure only one test instance can run at a time.
uvm_mutex_lock(&g_uvm_global.global_lock);
// Allocate the fake TLB tracking state. Notably tests still need to enable

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2020 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -140,10 +140,7 @@ static NvU64 small_half_pde_pascal(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_pascal(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
static void make_pde_pascal(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
{
NvU32 entry_count = entries_per_index_pascal(depth);
NvU64 *entry_bits = (NvU64 *)entry;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2019 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -22,10 +22,7 @@
*******************************************************************************/
#include "uvm_perf_events.h"
#include "uvm_va_block.h"
#include "uvm_va_range.h"
#include "uvm_va_space.h"
#include "uvm_kvmalloc.h"
#include "uvm_test.h"
// Global variable used to check that callbacks are correctly executed
@@ -46,10 +43,7 @@ static NV_STATUS test_events(uvm_va_space_t *va_space)
NV_STATUS status;
uvm_perf_event_data_t event_data;
uvm_va_block_t block;
test_data = 0;
memset(&event_data, 0, sizeof(event_data));
// Use CPU id to avoid triggering the GPU stats update code
@@ -58,6 +52,7 @@ static NV_STATUS test_events(uvm_va_space_t *va_space)
// Register a callback for page fault
status = uvm_perf_register_event_callback(&va_space->perf_events, UVM_PERF_EVENT_FAULT, callback_inc_1);
TEST_CHECK_GOTO(status == NV_OK, done);
// Register a callback for page fault
status = uvm_perf_register_event_callback(&va_space->perf_events, UVM_PERF_EVENT_FAULT, callback_inc_2);
TEST_CHECK_GOTO(status == NV_OK, done);
@@ -65,13 +60,14 @@ static NV_STATUS test_events(uvm_va_space_t *va_space)
// va_space read lock is required for page fault event notification
uvm_va_space_down_read(va_space);
// Notify (fake) page fault. The two registered callbacks for this event increment the value of test_value
event_data.fault.block = &block;
// Notify (fake) page fault. The two registered callbacks for this event
// increment the value of test_value
uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_FAULT, &event_data);
uvm_va_space_up_read(va_space);
// test_data was initialized to zero. It should have been incremented by 1 and 2, respectively in the callbacks
// test_data was initialized to zero. It should have been incremented by 1
// and 2, respectively in the callbacks
TEST_CHECK_GOTO(test_data == 3, done);
done:
@@ -96,4 +92,3 @@ NV_STATUS uvm_test_perf_events_sanity(UVM_TEST_PERF_EVENTS_SANITY_PARAMS *params
done:
return status;
}

View File

@@ -355,7 +355,7 @@ static NvU32 uvm_perf_prefetch_prenotify_fault_migrations(uvm_va_block_t *va_blo
uvm_page_mask_zero(prefetch_pages);
if (UVM_ID_IS_CPU(new_residency) || va_block->gpus[uvm_id_gpu_index(new_residency)] != NULL)
resident_mask = uvm_va_block_resident_mask_get(va_block, new_residency);
resident_mask = uvm_va_block_resident_mask_get(va_block, new_residency, NUMA_NO_NODE);
// If this is a first-touch fault and the destination processor is the
// preferred location, populate the whole max_prefetch_region.

View File

@@ -164,7 +164,7 @@ typedef struct
uvm_spinlock_t lock;
uvm_va_block_context_t va_block_context;
uvm_va_block_context_t *va_block_context;
// Flag used to avoid scheduling delayed unpinning operations after
// uvm_perf_thrashing_stop has been called.
@@ -601,6 +601,14 @@ static va_space_thrashing_info_t *va_space_thrashing_info_create(uvm_va_space_t
va_space_thrashing = uvm_kvmalloc_zero(sizeof(*va_space_thrashing));
if (va_space_thrashing) {
uvm_va_block_context_t *block_context = uvm_va_block_context_alloc(NULL);
if (!block_context) {
uvm_kvfree(va_space_thrashing);
return NULL;
}
va_space_thrashing->pinned_pages.va_block_context = block_context;
va_space_thrashing->va_space = va_space;
va_space_thrashing_info_init_params(va_space_thrashing);
@@ -621,6 +629,7 @@ static void va_space_thrashing_info_destroy(uvm_va_space_t *va_space)
if (va_space_thrashing) {
uvm_perf_module_type_unset_data(va_space->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
uvm_va_block_context_free(va_space_thrashing->pinned_pages.va_block_context);
uvm_kvfree(va_space_thrashing);
}
}
@@ -1104,7 +1113,7 @@ static NV_STATUS unmap_remote_pinned_pages(uvm_va_block_t *va_block,
!uvm_processor_mask_test(&policy->accessed_by, processor_id));
if (uvm_processor_mask_test(&va_block->resident, processor_id)) {
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id);
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id, NUMA_NO_NODE);
if (!uvm_page_mask_andnot(&va_block_context->caller_page_mask,
&block_thrashing->pinned_pages.mask,
@@ -1312,9 +1321,8 @@ void thrashing_event_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_
if (block_thrashing->last_time_stamp == 0 ||
uvm_id_equal(block_thrashing->last_processor, processor_id) ||
time_stamp - block_thrashing->last_time_stamp > va_space_thrashing->params.lapse_ns) {
time_stamp - block_thrashing->last_time_stamp > va_space_thrashing->params.lapse_ns)
goto done;
}
num_block_pages = uvm_va_block_size(va_block) / PAGE_SIZE;
@@ -1803,7 +1811,7 @@ static void thrashing_unpin_pages(struct work_struct *work)
struct delayed_work *dwork = to_delayed_work(work);
va_space_thrashing_info_t *va_space_thrashing = container_of(dwork, va_space_thrashing_info_t, pinned_pages.dwork);
uvm_va_space_t *va_space = va_space_thrashing->va_space;
uvm_va_block_context_t *va_block_context = &va_space_thrashing->pinned_pages.va_block_context;
uvm_va_block_context_t *va_block_context = va_space_thrashing->pinned_pages.va_block_context;
// Take the VA space lock so that VA blocks don't go away during this
// operation.
@@ -1937,7 +1945,6 @@ void uvm_perf_thrashing_unload(uvm_va_space_t *va_space)
// Make sure that there are not pending work items
if (va_space_thrashing) {
UVM_ASSERT(va_space_thrashing->pinned_pages.in_va_space_teardown);
UVM_ASSERT(list_empty(&va_space_thrashing->pinned_pages.list));
va_space_thrashing_info_destroy(va_space);

View File

@@ -3377,76 +3377,47 @@ uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page)
return gpu->id;
}
static void evict_orphan_pages(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
NvU32 i;
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
UVM_ASSERT(chunk->suballoc);
for (i = 0; i < num_subchunks(chunk); i++) {
uvm_gpu_chunk_t *subchunk = chunk->suballoc->subchunks[i];
uvm_spin_lock(&pmm->list_lock);
if (subchunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT) {
uvm_spin_unlock(&pmm->list_lock);
evict_orphan_pages(pmm, subchunk);
continue;
}
if (subchunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED && subchunk->is_referenced) {
unsigned long pfn = uvm_pmm_gpu_devmem_get_pfn(pmm, subchunk);
// TODO: Bug 3368756: add support for large GPU pages.
UVM_ASSERT(uvm_gpu_chunk_get_size(subchunk) == PAGE_SIZE);
uvm_spin_unlock(&pmm->list_lock);
// The above check for subchunk state is racy because the
// chunk may be freed after the lock is dropped. It is
// still safe to proceed in that case because the struct
// page reference will have dropped to zero and cannot
// have been re-allocated as this is only called during
// GPU teardown. Therefore migrate_device_range() will
// simply fail.
uvm_hmm_pmm_gpu_evict_pfn(pfn);
continue;
}
uvm_spin_unlock(&pmm->list_lock);
}
}
// Free any orphan pages.
// This should be called as part of removing a GPU: after all work is stopped
// and all va_blocks have been destroyed. There normally won't be any
// device private struct page references left but there can be cases after
// fork() where a child process still holds a reference. This function searches
// for pages that still have a reference and migrates the page to the GPU in
// order to release the reference in the CPU page table.
static void uvm_pmm_gpu_free_orphan_pages(uvm_pmm_gpu_t *pmm)
// Check there are no orphan pages. This should only be called as part of
// removing a GPU: after all work is stopped and all va_blocks have been
// destroyed. By now there should be no device-private page references left as
// there are no va_space's left on this GPU and orphan pages should be removed
// by va_space destruction or unregistration from the GPU.
static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
{
size_t i;
bool ret = true;
unsigned long pfn;
struct range range = pmm->devmem.pagemap.range;
if (!pmm->initialized)
return;
// This is only safe to call during GPU teardown where chunks
// cannot be re-allocated.
UVM_ASSERT(uvm_gpu_retained_count(uvm_pmm_to_gpu(pmm)) == 0);
if (!pmm->initialized || !uvm_hmm_is_enabled_system_wide())
return ret;
// Scan all the root chunks looking for subchunks which are still
// referenced. This is slow, but we only do this when unregistering a GPU
// and is not critical for performance.
// referenced.
for (i = 0; i < pmm->root_chunks.count; i++) {
uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
root_chunk_lock(pmm, root_chunk);
if (root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
evict_orphan_pages(pmm, &root_chunk->chunk);
ret = false;
root_chunk_unlock(pmm, root_chunk);
}
for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
struct page *page = pfn_to_page(pfn);
if (!is_device_private_page(page)) {
ret = false;
break;
}
if (page_count(page)) {
ret = false;
break;
}
}
return ret;
}
static void devmem_page_free(struct page *page)
@@ -3479,7 +3450,7 @@ static vm_fault_t devmem_fault(struct vm_fault *vmf)
{
uvm_va_space_t *va_space = vmf->page->zone_device_data;
if (!va_space || va_space->va_space_mm.mm != vmf->vma->vm_mm)
if (!va_space)
return VM_FAULT_SIGBUS;
return uvm_va_space_cpu_fault_hmm(va_space, vmf->vma, vmf);
@@ -3568,8 +3539,9 @@ static void devmem_deinit(uvm_pmm_gpu_t *pmm)
{
}
static void uvm_pmm_gpu_free_orphan_pages(uvm_pmm_gpu_t *pmm)
static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
{
return true;
}
#endif // UVM_IS_CONFIG_HMM()
@@ -3744,7 +3716,7 @@ void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm)
gpu = uvm_pmm_to_gpu(pmm);
uvm_pmm_gpu_free_orphan_pages(pmm);
UVM_ASSERT(uvm_pmm_gpu_check_orphan_pages(pmm));
nv_kthread_q_flush(&gpu->parent->lazy_free_q);
UVM_ASSERT(list_empty(&pmm->root_chunks.va_block_lazy_free));
release_free_root_chunks(pmm);

View File

@@ -749,6 +749,7 @@ NV_STATUS uvm_cpu_chunk_map_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
}
static struct page *uvm_cpu_chunk_alloc_page(uvm_chunk_size_t alloc_size,
int nid,
uvm_cpu_chunk_alloc_flags_t alloc_flags)
{
gfp_t kernel_alloc_flags;
@@ -764,18 +765,27 @@ static struct page *uvm_cpu_chunk_alloc_page(uvm_chunk_size_t alloc_size,
kernel_alloc_flags |= GFP_HIGHUSER;
// For allocation sizes higher than PAGE_SIZE, use __GFP_NORETRY in
// order to avoid higher allocation latency from the kernel compacting
// memory to satisfy the request.
// For allocation sizes higher than PAGE_SIZE, use __GFP_NORETRY in order
// to avoid higher allocation latency from the kernel compacting memory to
// satisfy the request.
// Use __GFP_NOWARN to avoid printing allocation failure to the kernel log.
// High order allocation failures are handled gracefully by the caller.
if (alloc_size > PAGE_SIZE)
kernel_alloc_flags |= __GFP_COMP | __GFP_NORETRY;
kernel_alloc_flags |= __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN;
if (alloc_flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO)
kernel_alloc_flags |= __GFP_ZERO;
page = alloc_pages(kernel_alloc_flags, get_order(alloc_size));
if (page && (alloc_flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO))
SetPageDirty(page);
UVM_ASSERT(nid < num_online_nodes());
if (nid == NUMA_NO_NODE)
page = alloc_pages(kernel_alloc_flags, get_order(alloc_size));
else
page = alloc_pages_node(nid, kernel_alloc_flags, get_order(alloc_size));
if (page) {
if (alloc_flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO)
SetPageDirty(page);
}
return page;
}
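// Worked example of the flag composition above (illustrative; assumes 4KB
// base pages): a 2MB chunk requested with UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO is
// allocated with order get_order(2MB) = 9 and picks up __GFP_COMP |
// __GFP_NORETRY | __GFP_NOWARN on top of the base flags, plus __GFP_ZERO, so
// a failed high-order allocation returns NULL quietly instead of retrying or
// logging a warning.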
@@ -805,6 +815,7 @@ static uvm_cpu_physical_chunk_t *uvm_cpu_chunk_create(uvm_chunk_size_t alloc_siz
NV_STATUS uvm_cpu_chunk_alloc(uvm_chunk_size_t alloc_size,
uvm_cpu_chunk_alloc_flags_t alloc_flags,
int nid,
uvm_cpu_chunk_t **new_chunk)
{
uvm_cpu_physical_chunk_t *chunk;
@@ -812,7 +823,7 @@ NV_STATUS uvm_cpu_chunk_alloc(uvm_chunk_size_t alloc_size,
UVM_ASSERT(new_chunk);
page = uvm_cpu_chunk_alloc_page(alloc_size, alloc_flags);
page = uvm_cpu_chunk_alloc_page(alloc_size, nid, alloc_flags);
if (!page)
return NV_ERR_NO_MEMORY;
@@ -847,6 +858,13 @@ NV_STATUS uvm_cpu_chunk_alloc_hmm(struct page *page,
return NV_OK;
}
int uvm_cpu_chunk_get_numa_node(uvm_cpu_chunk_t *chunk)
{
UVM_ASSERT(chunk);
UVM_ASSERT(chunk->page);
return page_to_nid(chunk->page);
}
NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chunks)
{
NV_STATUS status = NV_OK;

View File

@@ -304,11 +304,24 @@ uvm_chunk_sizes_mask_t uvm_cpu_chunk_get_allocation_sizes(void);
// Allocate a physical CPU chunk of the specified size.
//
// The nid argument is used to indicate a memory node preference. If the
// value is a memory node ID, the chunk allocation will be attempted on
// that memory node. If the chunk cannot be allocated on that memory node,
// it will be allocated on any memory node allowed by the process's policy.
//
// If the value of nid is a memory node ID that is not in the set of
// current process's allowed memory nodes, it will be allocated on one of the
// nodes in the allowed set.
//
// If the value of nid is NUMA_NO_NODE, the chunk will be allocated from any
// of the memory nodes allowed by the process policy.
//
// If a CPU chunk allocation succeeds, NV_OK is returned. new_chunk will be set
// to point to the newly allocated chunk. On failure, NV_ERR_NO_MEMORY is
// returned.
NV_STATUS uvm_cpu_chunk_alloc(uvm_chunk_size_t alloc_size,
uvm_cpu_chunk_alloc_flags_t flags,
int nid,
uvm_cpu_chunk_t **new_chunk);
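//
// Illustrative usage sketch (not part of this interface; the surrounding
// context and error handling are assumptions):
//
//     uvm_cpu_chunk_t *chunk;
//     NV_STATUS status = uvm_cpu_chunk_alloc(PAGE_SIZE,
//                                            UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO,
//                                            numa_mem_id(),
//                                            &chunk);
//
//     if (status == NV_OK) {
//         // The kernel may still place the pages elsewhere, so query the
//         // node that was actually used rather than assuming the hint was
//         // honored.
//         int nid = uvm_cpu_chunk_get_numa_node(chunk);
//
//         uvm_cpu_chunk_free(chunk);
//     }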
// Allocate a HMM CPU chunk.
@@ -375,6 +388,9 @@ static uvm_cpu_logical_chunk_t *uvm_cpu_chunk_to_logical(uvm_cpu_chunk_t *chunk)
return container_of((chunk), uvm_cpu_logical_chunk_t, common);
}
// Return the NUMA node ID of the physical page backing the chunk.
int uvm_cpu_chunk_get_numa_node(uvm_cpu_chunk_t *chunk);
// Free a CPU chunk.
// This may not result in the immediate freeing of the physical pages of the
// chunk if this is a logical chunk and there are other logical chunks holding

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2019 NVIDIA Corporation
Copyright (c) 2017-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -664,6 +664,7 @@ done:
static NV_STATUS test_cpu_chunk_alloc(uvm_chunk_size_t size,
uvm_cpu_chunk_alloc_flags_t flags,
int nid,
uvm_cpu_chunk_t **out_chunk)
{
uvm_cpu_chunk_t *chunk;
@@ -675,7 +676,7 @@ static NV_STATUS test_cpu_chunk_alloc(uvm_chunk_size_t size,
// It is possible that the allocation fails due to lack of large pages
// rather than an API issue, which will result in a false negative.
// However, that should be very rare.
TEST_NV_CHECK_RET(uvm_cpu_chunk_alloc(size, flags, &chunk));
TEST_NV_CHECK_RET(uvm_cpu_chunk_alloc(size, flags, nid, &chunk));
// Check general state of the chunk:
// - chunk should be a physical chunk,
@@ -685,6 +686,12 @@ static NV_STATUS test_cpu_chunk_alloc(uvm_chunk_size_t size,
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(chunk) == size, done);
TEST_CHECK_GOTO(uvm_cpu_chunk_num_pages(chunk) == size / PAGE_SIZE, done);
// It is possible for the kernel to allocate a chunk on a NUMA node other
// than the one requested. However, that should not be an issue with
// sufficient memory on each NUMA node.
if (nid != NUMA_NO_NODE)
TEST_CHECK_GOTO(uvm_cpu_chunk_get_numa_node(chunk) == nid, done);
if (flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO) {
NvU64 *cpu_addr;
@@ -719,7 +726,7 @@ static NV_STATUS test_cpu_chunk_mapping_basic_verify(uvm_gpu_t *gpu,
NvU64 dma_addr;
NV_STATUS status = NV_OK;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, flags, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, flags, NUMA_NO_NODE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
// Check state of the physical chunk:
@@ -763,27 +770,27 @@ static NV_STATUS test_cpu_chunk_mapping_basic(uvm_gpu_t *gpu, uvm_cpu_chunk_allo
return NV_OK;
}
static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu1, uvm_gpu_t *gpu2, uvm_gpu_t *gpu3)
static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1, uvm_gpu_t *gpu2)
{
NV_STATUS status = NV_OK;
uvm_cpu_chunk_t *chunk;
uvm_cpu_physical_chunk_t *phys_chunk;
NvU64 dma_addr_gpu2;
NvU64 dma_addr_gpu1;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(PAGE_SIZE, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(PAGE_SIZE, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu3), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu3), done);
dma_addr_gpu2 = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu2->parent);
uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu3->parent);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
dma_addr_gpu1 = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1->parent);
uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu2->parent);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu0), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu0), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
// DMA mapping addresses for different GPUs live in different IOMMU spaces,
// so it would be perfectly legal for them to have the same IOVA, and even
@@ -793,7 +800,7 @@ static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu1, uvm_gpu_t *gpu2,
// GPU1. It's true that we may get a false negative if both addresses
// happened to alias and we had a bug in how the addresses are shifted in
// the dense array, but that's better than intermittent failure.
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu2->parent) == dma_addr_gpu2, done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1->parent) == dma_addr_gpu1, done);
done:
uvm_cpu_chunk_free(chunk);
@@ -911,7 +918,7 @@ static NV_STATUS test_cpu_chunk_split_and_merge(uvm_gpu_t *gpu)
uvm_cpu_chunk_t *chunk;
NV_STATUS status;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
status = do_test_cpu_chunk_split_and_merge(chunk, gpu);
uvm_cpu_chunk_free(chunk);
@@ -993,7 +1000,7 @@ static NV_STATUS test_cpu_chunk_dirty(uvm_gpu_t *gpu)
uvm_cpu_physical_chunk_t *phys_chunk;
size_t num_pages;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
num_pages = uvm_cpu_chunk_num_pages(chunk);
@@ -1005,7 +1012,7 @@ static NV_STATUS test_cpu_chunk_dirty(uvm_gpu_t *gpu)
uvm_cpu_chunk_free(chunk);
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO, NUMA_NO_NODE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
num_pages = uvm_cpu_chunk_num_pages(chunk);
@@ -1170,13 +1177,35 @@ NV_STATUS test_cpu_chunk_free(uvm_va_space_t *va_space, uvm_processor_mask_t *te
size_t size = uvm_chunk_find_next_size(alloc_sizes, PAGE_SIZE);
for_each_chunk_size_from(size, alloc_sizes) {
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
TEST_NV_CHECK_RET(do_test_cpu_chunk_free(chunk, va_space, test_gpus));
}
return NV_OK;
}
static NV_STATUS test_cpu_chunk_numa_alloc(uvm_va_space_t *va_space)
{
uvm_cpu_chunk_t *chunk;
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
size_t size;
for_each_chunk_size(size, alloc_sizes) {
int nid;
for_each_possible_uvm_node(nid) {
// Do not test CPU allocation on nodes that have no memory or CPU
if (!node_state(nid, N_MEMORY) || !node_state(nid, N_CPU))
continue;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, nid, &chunk));
uvm_cpu_chunk_free(chunk);
}
}
return NV_OK;
}
NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
@@ -1197,6 +1226,7 @@ NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct f
}
TEST_NV_CHECK_GOTO(test_cpu_chunk_free(va_space, &test_gpus), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_numa_alloc(va_space), done);
if (uvm_processor_mask_get_gpu_count(&test_gpus) >= 3) {
uvm_gpu_t *gpu2, *gpu3;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -324,7 +324,7 @@ static NV_STATUS gpu_mem_check(uvm_gpu_t *gpu,
// TODO: Bug 3839176: [UVM][HCC][uvm_test] Update tests that assume GPU
// engines can directly access sysmem
// Skip this test for now. To enable this test in Confidential Computing,
// Skip this test for now. To enable this test under SEV,
// The GPU->CPU CE copy needs to be updated so it uses encryption when
// CC is enabled.
if (uvm_conf_computing_mode_enabled(gpu))
@@ -1068,7 +1068,7 @@ static NV_STATUS test_pmm_reverse_map_single(uvm_gpu_t *gpu, uvm_va_space_t *va_
uvm_mutex_lock(&va_block->lock);
is_resident = uvm_processor_mask_test(&va_block->resident, gpu->id) &&
uvm_page_mask_full(uvm_va_block_resident_mask_get(va_block, gpu->id));
uvm_page_mask_full(uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE));
if (is_resident)
phys_addr = uvm_va_block_gpu_phys_page_address(va_block, 0, gpu);
@@ -1154,7 +1154,7 @@ static NV_STATUS test_pmm_reverse_map_many_blocks(uvm_gpu_t *gpu, uvm_va_space_t
uvm_mutex_lock(&va_block->lock);
// Verify that all pages are populated on the GPU
is_resident = uvm_page_mask_region_full(uvm_va_block_resident_mask_get(va_block, gpu->id),
is_resident = uvm_page_mask_region_full(uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE),
reverse_mapping->region);
uvm_mutex_unlock(&va_block->lock);
@@ -1223,6 +1223,8 @@ static NV_STATUS test_indirect_peers(uvm_gpu_t *owning_gpu, uvm_gpu_t *accessing
if (!chunks)
return NV_ERR_NO_MEMORY;
UVM_ASSERT(!g_uvm_global.sev_enabled);
TEST_NV_CHECK_GOTO(uvm_mem_alloc_sysmem_and_map_cpu_kernel(UVM_CHUNK_SIZE_MAX, current->mm, &verif_mem), out);
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(verif_mem, owning_gpu), out);
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(verif_mem, accessing_gpu), out);

View File

@@ -176,7 +176,9 @@ static NV_STATUS preferred_location_unmap_remote_pages(uvm_va_block_t *va_block,
mapped_mask = uvm_va_block_map_mask_get(va_block, preferred_location);
if (uvm_processor_mask_test(&va_block->resident, preferred_location)) {
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, preferred_location);
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block,
preferred_location,
NUMA_NO_NODE);
if (!uvm_page_mask_andnot(&va_block_context->caller_page_mask, mapped_mask, resident_mask))
goto done;
@@ -638,7 +640,7 @@ static NV_STATUS va_block_set_read_duplication_locked(uvm_va_block_t *va_block,
for_each_id_in_mask(src_id, &va_block->resident) {
NV_STATUS status;
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id);
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id, NUMA_NO_NODE);
// Calling uvm_va_block_make_resident_read_duplicate will break all
// SetAccessedBy and remote mappings
@@ -695,7 +697,7 @@ static NV_STATUS va_block_unset_read_duplication_locked(uvm_va_block_t *va_block
// If preferred_location is set and has resident copies, give it preference
if (UVM_ID_IS_VALID(preferred_location) &&
uvm_processor_mask_test(&va_block->resident, preferred_location)) {
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, preferred_location);
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, preferred_location, NUMA_NO_NODE);
bool is_mask_empty = !uvm_page_mask_and(break_read_duplication_pages,
&va_block->read_duplicated_pages,
resident_mask);
@@ -723,7 +725,7 @@ static NV_STATUS va_block_unset_read_duplication_locked(uvm_va_block_t *va_block
if (uvm_id_equal(processor_id, preferred_location))
continue;
resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id);
resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id, NUMA_NO_NODE);
is_mask_empty = !uvm_page_mask_and(break_read_duplication_pages,
&va_block->read_duplicated_pages,
resident_mask);

View File

@@ -0,0 +1,40 @@
/*******************************************************************************
Copyright (c) 2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#include "uvm_processors.h"
int uvm_find_closest_node_mask(int src, const nodemask_t *mask)
{
int nid;
int closest_nid = NUMA_NO_NODE;
if (node_isset(src, *mask))
return src;
for_each_set_bit(nid, mask->bits, MAX_NUMNODES) {
if (closest_nid == NUMA_NO_NODE || node_distance(src, nid) < node_distance(src, closest_nid))
closest_nid = nid;
}
return closest_nid;
}
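// Illustrative usage sketch (the caller and the candidate mask below are
// assumptions, not part of this file): pick the node closest to the current
// CPU's memory node among the nodes that have memory.
//
//     nodemask_t candidates = NODE_MASK_NONE;
//     int nid, closest;
//
//     for_each_node_state(nid, N_MEMORY)
//         node_set(nid, candidates);
//
//     closest = uvm_find_closest_node_mask(numa_mem_id(), &candidates);
//     if (closest == NUMA_NO_NODE)
//         closest = numa_mem_id();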

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2019 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -26,6 +26,7 @@
#include "uvm_linux.h"
#include "uvm_common.h"
#include <linux/numa.h>
#define UVM_MAX_UNIQUE_GPU_PAIRS SUM_FROM_0_TO_N(UVM_MAX_GPUS - 1)
@@ -37,11 +38,11 @@
// provide type safety, they are wrapped within the uvm_processor_id_t struct.
// The range of valid identifiers needs to cover the maximum number of
// supported GPUs on a system plus the CPU. CPU is assigned value 0, and GPUs
// range: [1, UVM_ID_MAX_GPUS].
// range: [1, UVM_PARENT_ID_MAX_GPUS].
//
// There are some functions that only expect GPU identifiers and, in order to
// make it clearer, the uvm_gpu_id_t alias type is provided. However, as this
// type is just a typedef of uvm_processor_id_t, there is no type checking
// make it clearer, the uvm_parent_gpu_id_t alias type is provided. However, as
// this type is just a typedef of uvm_processor_id_t, there is no type checking
// performed by the compiler.
//
// Identifier value vs index
@@ -60,22 +61,25 @@
// the GPU within the GPU id space (basically id - 1).
//
// In the diagram below, MAX_SUB is used to abbreviate
// UVM_ID_MAX_SUB_PROCESSORS.
// UVM_PARENT_ID_MAX_SUB_PROCESSORS.
//
// |-------------------------- uvm_processor_id_t ----------------------|
// | |
// | |----------------------- uvm_gpu_id_t ------------------------||
// | | ||
// Proc type | CPU | GPU ... GPU ... GPU ||
// | | ||
// ID values | 0 | 1 ... i+1 ... UVM_ID_MAX_PROCESSORS-1 ||
// TODO: Bug 4195538: uvm_parent_processor_id_t is currently but temporarily the
// same as uvm_processor_id_t.
//
// GPU index 0 ... i ... UVM_ID_MAX_GPUS-1
// |-------------------------- uvm_parent_processor_id_t ----------------------|
// | |
// | |----------------------- uvm_parent_gpu_id_t ------------------------||
// | | ||
// Proc type | CPU | GPU ... GPU ... GPU ||
// | | ||
// ID values | 0 | 1 ... i+1 ... UVM_PARENT_ID_MAX_PROCESSORS-1 ||
//
// GPU index 0 ... i ... UVM_PARENT_ID_MAX_GPUS-1
// | | | |
// | | | |
// | |-------------| | |-----------------------------|
// | | | |
// | | | |
// | |-------------| | |------------------------------------|
// | | | |
// | | | |
// GPU index 0 ... MAX_SUB-1 ... i*MAX_SUB ... (i+1)*MAX_SUB-1 ... UVM_GLOBAL_ID_MAX_GPUS-1
//
// ID values | 0 | 1 ... MAX_SUB ... (i*MAX_SUB)+1 ... (i+1)*MAX_SUB ... UVM_GLOBAL_ID_MAX_PROCESSORS-1 ||
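// Worked example of the index math above: with MAX_SUB = 8
// (UVM_PARENT_ID_MAX_SUB_PROCESSORS), the parent GPU at index i = 2 (parent
// ID value 3) covers global GPU indices 16..23, i.e. global ID values 17..24.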
@@ -210,7 +214,7 @@ static proc_id_t prefix_fn_mask##_find_first_id(const mask_t *mask)
\
static proc_id_t prefix_fn_mask##_find_first_gpu_id(const mask_t *mask) \
{ \
return proc_id_ctor(find_next_bit(mask->bitmap, (maxval), UVM_ID_GPU0_VALUE)); \
return proc_id_ctor(find_next_bit(mask->bitmap, (maxval), UVM_PARENT_ID_GPU0_VALUE)); \
} \
\
static proc_id_t prefix_fn_mask##_find_next_id(const mask_t *mask, proc_id_t min_id) \
@@ -252,7 +256,7 @@ static NvU32 prefix_fn_mask##_get_gpu_count(const mask_t *mask)
{ \
NvU32 gpu_count = prefix_fn_mask##_get_count(mask); \
\
if (prefix_fn_mask##_test(mask, proc_id_ctor(UVM_ID_CPU_VALUE))) \
if (prefix_fn_mask##_test(mask, proc_id_ctor(UVM_PARENT_ID_CPU_VALUE))) \
--gpu_count; \
\
return gpu_count; \
@@ -261,55 +265,55 @@ static NvU32 prefix_fn_mask##_get_gpu_count(const mask_t *mask)
typedef struct
{
NvU32 val;
} uvm_processor_id_t;
} uvm_parent_processor_id_t;
typedef struct
{
NvU32 val;
} uvm_global_processor_id_t;
typedef uvm_processor_id_t uvm_gpu_id_t;
typedef uvm_parent_processor_id_t uvm_parent_gpu_id_t;
typedef uvm_global_processor_id_t uvm_global_gpu_id_t;
// Static value assigned to the CPU
#define UVM_ID_CPU_VALUE 0
#define UVM_ID_GPU0_VALUE (UVM_ID_CPU_VALUE + 1)
#define UVM_PARENT_ID_CPU_VALUE 0
#define UVM_PARENT_ID_GPU0_VALUE (UVM_PARENT_ID_CPU_VALUE + 1)
// ID values for the CPU and first GPU, respectively; the values for both types
// of IDs must match to enable sharing of UVM_PROCESSOR_MASK().
#define UVM_GLOBAL_ID_CPU_VALUE UVM_ID_CPU_VALUE
#define UVM_GLOBAL_ID_GPU0_VALUE UVM_ID_GPU0_VALUE
#define UVM_GLOBAL_ID_CPU_VALUE UVM_PARENT_ID_CPU_VALUE
#define UVM_GLOBAL_ID_GPU0_VALUE UVM_PARENT_ID_GPU0_VALUE
// Maximum number of GPUs/processors that can be represented with the id types
#define UVM_ID_MAX_GPUS UVM_MAX_GPUS
#define UVM_ID_MAX_PROCESSORS UVM_MAX_PROCESSORS
#define UVM_PARENT_ID_MAX_GPUS UVM_MAX_GPUS
#define UVM_PARENT_ID_MAX_PROCESSORS UVM_MAX_PROCESSORS
#define UVM_ID_MAX_SUB_PROCESSORS 8
#define UVM_PARENT_ID_MAX_SUB_PROCESSORS 8
#define UVM_GLOBAL_ID_MAX_GPUS (UVM_MAX_GPUS * UVM_ID_MAX_SUB_PROCESSORS)
#define UVM_GLOBAL_ID_MAX_GPUS (UVM_PARENT_ID_MAX_GPUS * UVM_PARENT_ID_MAX_SUB_PROCESSORS)
#define UVM_GLOBAL_ID_MAX_PROCESSORS (UVM_GLOBAL_ID_MAX_GPUS + 1)
#define UVM_ID_CPU ((uvm_processor_id_t) { .val = UVM_ID_CPU_VALUE })
#define UVM_ID_INVALID ((uvm_processor_id_t) { .val = UVM_ID_MAX_PROCESSORS })
#define UVM_PARENT_ID_CPU ((uvm_parent_processor_id_t) { .val = UVM_PARENT_ID_CPU_VALUE })
#define UVM_PARENT_ID_INVALID ((uvm_parent_processor_id_t) { .val = UVM_PARENT_ID_MAX_PROCESSORS })
#define UVM_GLOBAL_ID_CPU ((uvm_global_processor_id_t) { .val = UVM_GLOBAL_ID_CPU_VALUE })
#define UVM_GLOBAL_ID_INVALID ((uvm_global_processor_id_t) { .val = UVM_GLOBAL_ID_MAX_PROCESSORS })
#define UVM_ID_CHECK_BOUNDS(id) UVM_ASSERT_MSG(id.val <= UVM_ID_MAX_PROCESSORS, "id %u\n", id.val)
#define UVM_PARENT_ID_CHECK_BOUNDS(id) UVM_ASSERT_MSG(id.val <= UVM_PARENT_ID_MAX_PROCESSORS, "id %u\n", id.val)
#define UVM_GLOBAL_ID_CHECK_BOUNDS(id) UVM_ASSERT_MSG(id.val <= UVM_GLOBAL_ID_MAX_PROCESSORS, "id %u\n", id.val)
static int uvm_id_cmp(uvm_processor_id_t id1, uvm_processor_id_t id2)
static int uvm_parent_id_cmp(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
{
UVM_ID_CHECK_BOUNDS(id1);
UVM_ID_CHECK_BOUNDS(id2);
UVM_PARENT_ID_CHECK_BOUNDS(id1);
UVM_PARENT_ID_CHECK_BOUNDS(id2);
return UVM_CMP_DEFAULT(id1.val, id2.val);
}
static bool uvm_id_equal(uvm_processor_id_t id1, uvm_processor_id_t id2)
static bool uvm_parent_id_equal(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
{
UVM_ID_CHECK_BOUNDS(id1);
UVM_ID_CHECK_BOUNDS(id2);
UVM_PARENT_ID_CHECK_BOUNDS(id1);
UVM_PARENT_ID_CHECK_BOUNDS(id2);
return id1.val == id2.val;
}
@@ -330,30 +334,30 @@ static bool uvm_global_id_equal(uvm_global_processor_id_t id1, uvm_global_proces
return id1.val == id2.val;
}
#define UVM_ID_IS_CPU(id) uvm_id_equal(id, UVM_ID_CPU)
#define UVM_ID_IS_INVALID(id) uvm_id_equal(id, UVM_ID_INVALID)
#define UVM_ID_IS_VALID(id) (!UVM_ID_IS_INVALID(id))
#define UVM_ID_IS_GPU(id) (!UVM_ID_IS_CPU(id) && !UVM_ID_IS_INVALID(id))
#define UVM_PARENT_ID_IS_CPU(id) uvm_parent_id_equal(id, UVM_PARENT_ID_CPU)
#define UVM_PARENT_ID_IS_INVALID(id) uvm_parent_id_equal(id, UVM_PARENT_ID_INVALID)
#define UVM_PARENT_ID_IS_VALID(id) (!UVM_PARENT_ID_IS_INVALID(id))
#define UVM_PARENT_ID_IS_GPU(id) (!UVM_PARENT_ID_IS_CPU(id) && !UVM_PARENT_ID_IS_INVALID(id))
#define UVM_GLOBAL_ID_IS_CPU(id) uvm_global_id_equal(id, UVM_GLOBAL_ID_CPU)
#define UVM_GLOBAL_ID_IS_INVALID(id) uvm_global_id_equal(id, UVM_GLOBAL_ID_INVALID)
#define UVM_GLOBAL_ID_IS_VALID(id) (!UVM_GLOBAL_ID_IS_INVALID(id))
#define UVM_GLOBAL_ID_IS_GPU(id) (!UVM_GLOBAL_ID_IS_CPU(id) && !UVM_GLOBAL_ID_IS_INVALID(id))
static uvm_processor_id_t uvm_id_from_value(NvU32 val)
static uvm_parent_processor_id_t uvm_parent_id_from_value(NvU32 val)
{
uvm_processor_id_t ret = { .val = val };
uvm_parent_processor_id_t ret = { .val = val };
UVM_ID_CHECK_BOUNDS(ret);
UVM_PARENT_ID_CHECK_BOUNDS(ret);
return ret;
}
static uvm_gpu_id_t uvm_gpu_id_from_value(NvU32 val)
static uvm_parent_gpu_id_t uvm_parent_gpu_id_from_value(NvU32 val)
{
uvm_gpu_id_t ret = uvm_id_from_value(val);
uvm_parent_gpu_id_t ret = uvm_parent_id_from_value(val);
UVM_ASSERT(!UVM_ID_IS_CPU(ret));
UVM_ASSERT(!UVM_PARENT_ID_IS_CPU(ret));
return ret;
}
@@ -376,34 +380,34 @@ static uvm_global_gpu_id_t uvm_global_gpu_id_from_value(NvU32 val)
return ret;
}
// Create a GPU id from the given GPU id index (previously obtained via
// uvm_id_gpu_index)
static uvm_gpu_id_t uvm_gpu_id_from_index(NvU32 index)
// Create a parent GPU id from the given parent GPU id index (previously
// obtained via uvm_parent_id_gpu_index)
static uvm_parent_gpu_id_t uvm_parent_gpu_id_from_index(NvU32 index)
{
return uvm_gpu_id_from_value(index + UVM_ID_GPU0_VALUE);
return uvm_parent_gpu_id_from_value(index + UVM_PARENT_ID_GPU0_VALUE);
}
static uvm_processor_id_t uvm_id_next(uvm_processor_id_t id)
static uvm_parent_processor_id_t uvm_parent_id_next(uvm_parent_processor_id_t id)
{
++id.val;
UVM_ID_CHECK_BOUNDS(id);
UVM_PARENT_ID_CHECK_BOUNDS(id);
return id;
}
static uvm_gpu_id_t uvm_gpu_id_next(uvm_gpu_id_t id)
static uvm_parent_gpu_id_t uvm_parent_gpu_id_next(uvm_parent_gpu_id_t id)
{
UVM_ASSERT(UVM_ID_IS_GPU(id));
UVM_ASSERT(UVM_PARENT_ID_IS_GPU(id));
++id.val;
UVM_ID_CHECK_BOUNDS(id);
UVM_PARENT_ID_CHECK_BOUNDS(id);
return id;
}
// Same as uvm_gpu_id_from_index but for uvm_global_processor_id_t
// Same as uvm_parent_gpu_id_from_index but for uvm_global_processor_id_t
static uvm_global_gpu_id_t uvm_global_gpu_id_from_index(NvU32 index)
{
return uvm_global_gpu_id_from_value(index + UVM_GLOBAL_ID_GPU0_VALUE);
@@ -429,11 +433,11 @@ static uvm_global_gpu_id_t uvm_global_gpu_id_next(uvm_global_gpu_id_t id)
return id;
}
// This function returns the numerical value within [0, UVM_ID_MAX_PROCESSORS)
// of the given processor id
static NvU32 uvm_id_value(uvm_processor_id_t id)
// This function returns the numerical value within
// [0, UVM_PARENT_ID_MAX_PROCESSORS) of the given parent processor id.
static NvU32 uvm_parent_id_value(uvm_parent_processor_id_t id)
{
UVM_ASSERT(UVM_ID_IS_VALID(id));
UVM_ASSERT(UVM_PARENT_ID_IS_VALID(id));
return id.val;
}
@@ -448,12 +452,12 @@ static NvU32 uvm_global_id_value(uvm_global_processor_id_t id)
}
// This function returns the index of the given GPU id within the GPU id space
// [0, UVM_ID_MAX_GPUS)
static NvU32 uvm_id_gpu_index(uvm_gpu_id_t id)
// [0, UVM_PARENT_ID_MAX_GPUS)
static NvU32 uvm_parent_id_gpu_index(uvm_parent_gpu_id_t id)
{
UVM_ASSERT(UVM_ID_IS_GPU(id));
UVM_ASSERT(UVM_PARENT_ID_IS_GPU(id));
return id.val - UVM_ID_GPU0_VALUE;
return id.val - UVM_PARENT_ID_GPU0_VALUE;
}
// This function returns the index of the given GPU id within the GPU id space
@@ -465,61 +469,61 @@ static NvU32 uvm_global_id_gpu_index(const uvm_global_gpu_id_t id)
return id.val - UVM_GLOBAL_ID_GPU0_VALUE;
}
static NvU32 uvm_global_id_gpu_index_from_gpu_id(const uvm_gpu_id_t id)
static NvU32 uvm_global_id_gpu_index_from_parent_gpu_id(const uvm_parent_gpu_id_t id)
{
UVM_ASSERT(UVM_ID_IS_GPU(id));
UVM_ASSERT(UVM_PARENT_ID_IS_GPU(id));
return uvm_id_gpu_index(id) * UVM_ID_MAX_SUB_PROCESSORS;
return uvm_parent_id_gpu_index(id) * UVM_PARENT_ID_MAX_SUB_PROCESSORS;
}
static NvU32 uvm_id_gpu_index_from_global_gpu_id(const uvm_global_gpu_id_t id)
static NvU32 uvm_parent_id_gpu_index_from_global_gpu_id(const uvm_global_gpu_id_t id)
{
UVM_ASSERT(UVM_GLOBAL_ID_IS_GPU(id));
return uvm_global_id_gpu_index(id) / UVM_ID_MAX_SUB_PROCESSORS;
return uvm_global_id_gpu_index(id) / UVM_PARENT_ID_MAX_SUB_PROCESSORS;
}
static uvm_global_gpu_id_t uvm_global_gpu_id_from_gpu_id(const uvm_gpu_id_t id)
static uvm_global_gpu_id_t uvm_global_gpu_id_from_parent_gpu_id(const uvm_parent_gpu_id_t id)
{
UVM_ASSERT(UVM_ID_IS_GPU(id));
UVM_ASSERT(UVM_PARENT_ID_IS_GPU(id));
return uvm_global_gpu_id_from_index(uvm_global_id_gpu_index_from_gpu_id(id));
return uvm_global_gpu_id_from_index(uvm_global_id_gpu_index_from_parent_gpu_id(id));
}
static uvm_global_gpu_id_t uvm_global_gpu_id_from_parent_index(NvU32 index)
{
UVM_ASSERT(index < UVM_MAX_GPUS);
UVM_ASSERT(index < UVM_PARENT_ID_MAX_GPUS);
return uvm_global_gpu_id_from_gpu_id(uvm_gpu_id_from_value(index + UVM_GLOBAL_ID_GPU0_VALUE));
return uvm_global_gpu_id_from_parent_gpu_id(uvm_parent_gpu_id_from_value(index + UVM_GLOBAL_ID_GPU0_VALUE));
}
static uvm_global_gpu_id_t uvm_global_gpu_id_from_sub_processor_index(const uvm_gpu_id_t id, NvU32 sub_index)
static uvm_global_gpu_id_t uvm_global_gpu_id_from_sub_processor_index(const uvm_parent_gpu_id_t id, NvU32 sub_index)
{
NvU32 index;
UVM_ASSERT(sub_index < UVM_ID_MAX_SUB_PROCESSORS);
UVM_ASSERT(sub_index < UVM_PARENT_ID_MAX_SUB_PROCESSORS);
index = uvm_global_id_gpu_index_from_gpu_id(id) + sub_index;
index = uvm_global_id_gpu_index_from_parent_gpu_id(id) + sub_index;
return uvm_global_gpu_id_from_index(index);
}
static uvm_gpu_id_t uvm_gpu_id_from_global_gpu_id(const uvm_global_gpu_id_t id)
static uvm_parent_gpu_id_t uvm_parent_gpu_id_from_global_gpu_id(const uvm_global_gpu_id_t id)
{
UVM_ASSERT(UVM_GLOBAL_ID_IS_GPU(id));
return uvm_gpu_id_from_index(uvm_id_gpu_index_from_global_gpu_id(id));
return uvm_parent_gpu_id_from_index(uvm_parent_id_gpu_index_from_global_gpu_id(id));
}
static NvU32 uvm_global_id_sub_processor_index(const uvm_global_gpu_id_t id)
{
return uvm_global_id_gpu_index(id) % UVM_ID_MAX_SUB_PROCESSORS;
return uvm_global_id_gpu_index(id) % UVM_PARENT_ID_MAX_SUB_PROCESSORS;
}
UVM_PROCESSOR_MASK(uvm_processor_mask_t, \
uvm_processor_mask, \
UVM_ID_MAX_PROCESSORS, \
uvm_processor_id_t, \
uvm_id_from_value)
UVM_PARENT_ID_MAX_PROCESSORS, \
uvm_parent_processor_id_t, \
uvm_parent_id_from_value)
UVM_PROCESSOR_MASK(uvm_global_processor_mask_t, \
uvm_global_processor_mask, \
@@ -533,19 +537,19 @@ static bool uvm_processor_mask_gpu_subset(const uvm_processor_mask_t *subset, co
{
uvm_processor_mask_t subset_gpus;
uvm_processor_mask_copy(&subset_gpus, subset);
uvm_processor_mask_clear(&subset_gpus, UVM_ID_CPU);
uvm_processor_mask_clear(&subset_gpus, UVM_PARENT_ID_CPU);
return uvm_processor_mask_subset(&subset_gpus, mask);
}
#define for_each_id_in_mask(id, mask) \
for ((id) = uvm_processor_mask_find_first_id(mask); \
UVM_ID_IS_VALID(id); \
(id) = uvm_processor_mask_find_next_id((mask), uvm_id_next(id)))
UVM_PARENT_ID_IS_VALID(id); \
(id) = uvm_processor_mask_find_next_id((mask), uvm_parent_id_next(id)))
#define for_each_gpu_id_in_mask(gpu_id, mask) \
for ((gpu_id) = uvm_processor_mask_find_first_gpu_id((mask)); \
UVM_ID_IS_VALID(gpu_id); \
(gpu_id) = uvm_processor_mask_find_next_id((mask), uvm_gpu_id_next(gpu_id)))
UVM_PARENT_ID_IS_VALID(gpu_id); \
(gpu_id) = uvm_processor_mask_find_next_id((mask), uvm_parent_gpu_id_next(gpu_id)))
#define for_each_global_id_in_mask(id, mask) \
for ((id) = uvm_global_processor_mask_find_first_id(mask); \
@@ -559,21 +563,36 @@ static bool uvm_processor_mask_gpu_subset(const uvm_processor_mask_t *subset, co
// Helper to iterate over all valid gpu ids
#define for_each_gpu_id(i) \
for (i = uvm_gpu_id_from_value(UVM_ID_GPU0_VALUE); UVM_ID_IS_VALID(i); i = uvm_gpu_id_next(i))
for (i = uvm_parent_gpu_id_from_value(UVM_PARENT_ID_GPU0_VALUE); UVM_PARENT_ID_IS_VALID(i); i = uvm_parent_gpu_id_next(i))
#define for_each_global_gpu_id(i) \
for (i = uvm_global_gpu_id_from_value(UVM_GLOBAL_ID_GPU0_VALUE); UVM_GLOBAL_ID_IS_VALID(i); i = uvm_global_gpu_id_next(i))
#define for_each_global_sub_processor_id_in_gpu(id, i) \
for (i = uvm_global_gpu_id_from_gpu_id(id); \
for (i = uvm_global_gpu_id_from_parent_gpu_id(id); \
UVM_GLOBAL_ID_IS_VALID(i) && \
(uvm_global_id_value(i) < uvm_global_id_value(uvm_global_gpu_id_from_gpu_id(id)) + UVM_ID_MAX_SUB_PROCESSORS); \
(uvm_global_id_value(i) < uvm_global_id_value(uvm_global_gpu_id_from_parent_gpu_id(id)) + UVM_PARENT_ID_MAX_SUB_PROCESSORS); \
i = uvm_global_gpu_id_next(i))
// Helper to iterate over all valid processor ids
#define for_each_processor_id(i) for (i = UVM_ID_CPU; UVM_ID_IS_VALID(i); i = uvm_id_next(i))
#define for_each_processor_id(i) for (i = UVM_PARENT_ID_CPU; UVM_PARENT_ID_IS_VALID(i); i = uvm_parent_id_next(i))
#define for_each_global_id(i) for (i = UVM_GLOBAL_ID_CPU; UVM_GLOBAL_ID_IS_VALID(i); i = uvm_global_id_next(i))
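// Editorial illustration, not part of this commit: usage sketch of the mask
// iterators above. The counting it performs is already provided by
// uvm_processor_mask_get_gpu_count(); this only shows the iteration pattern.
static NvU32 example_count_gpus_in_mask(const uvm_processor_mask_t *mask)
{
    uvm_parent_gpu_id_t gpu_id;
    NvU32 gpu_count = 0;

    // Visits only the GPU ids set in the mask, skipping the CPU bit.
    for_each_gpu_id_in_mask(gpu_id, mask)
        ++gpu_count;

    return gpu_count;
}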
// Find the node in mask with the shortest distance (as returned by
// node_distance()) from src.
// Note that the search is inclusive of src.
// If mask has no bits set, NUMA_NO_NODE is returned.
int uvm_find_closest_node_mask(int src, const nodemask_t *mask);
// Iterate over all nodes in mask with increasing distance from src.
// Note that this iterator is destructive of the mask.
#define for_each_closest_uvm_node(nid, src, mask) \
for ((nid) = uvm_find_closest_node_mask((src), &(mask)); \
(nid) != NUMA_NO_NODE; \
node_clear((nid), (mask)), (nid) = uvm_find_closest_node_mask((src), &(mask)))
#define for_each_possible_uvm_node(nid) for_each_node_mask((nid), node_possible_map)
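// Editorial illustration, not part of this commit: because
// for_each_closest_uvm_node() clears bits as it iterates, callers are expected
// to iterate over a scratch copy of their node set. A minimal sketch, assuming
// the caller can afford to copy the nodemask by value:
static void example_visit_nodes_closest_first(int src_nid, nodemask_t candidates)
{
    int nid;

    // 'candidates' is a by-value copy, so the destructive iteration below does
    // not disturb the caller's node set.
    for_each_closest_uvm_node(nid, src_nid, candidates) {
        // Nodes are visited in order of increasing node_distance() from
        // src_nid, including src_nid itself when it is set in the mask.
    }
}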
static bool uvm_processor_uuid_eq(const NvProcessorUuid *uuid1, const NvProcessorUuid *uuid2)
{
return memcmp(uuid1, uuid2, sizeof(*uuid1)) == 0;
@@ -585,4 +604,78 @@ static void uvm_processor_uuid_copy(NvProcessorUuid *dst, const NvProcessorUuid
memcpy(dst, src, sizeof(*dst));
}
// TODO: Bug 4195538: [uvm][multi-SMC] Get UVM internal data structures ready to
// meet multi-SMC requirements. Temporary aliases, they must be removed once
// the data structures are converted.
typedef uvm_parent_processor_id_t uvm_processor_id_t;
typedef uvm_parent_gpu_id_t uvm_gpu_id_t;
#define UVM_ID_CPU_VALUE UVM_PARENT_ID_CPU_VALUE
#define UVM_ID_GPU0_VALUE UVM_PARENT_ID_GPU0_VALUE
#define UVM_ID_MAX_GPUS UVM_PARENT_ID_MAX_GPUS
#define UVM_ID_MAX_PROCESSORS UVM_PARENT_ID_MAX_PROCESSORS
#define UVM_ID_MAX_SUB_PROCESSORS UVM_PARENT_ID_MAX_SUB_PROCESSORS
#define UVM_ID_CPU UVM_PARENT_ID_CPU
#define UVM_ID_INVALID UVM_PARENT_ID_INVALID
static int uvm_id_cmp(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
{
return UVM_CMP_DEFAULT(id1.val, id2.val);
}
static bool uvm_id_equal(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
{
return uvm_parent_id_equal(id1, id2);
}
#define UVM_ID_IS_CPU(id) uvm_id_equal(id, UVM_ID_CPU)
#define UVM_ID_IS_INVALID(id) uvm_id_equal(id, UVM_ID_INVALID)
#define UVM_ID_IS_VALID(id) (!UVM_ID_IS_INVALID(id))
#define UVM_ID_IS_GPU(id) (!UVM_ID_IS_CPU(id) && !UVM_ID_IS_INVALID(id))
static uvm_parent_gpu_id_t uvm_gpu_id_from_value(NvU32 val)
{
return uvm_parent_gpu_id_from_value(val);
}
static NvU32 uvm_id_value(uvm_parent_processor_id_t id)
{
return uvm_parent_id_value(id);
}
static NvU32 uvm_id_gpu_index(uvm_parent_gpu_id_t id)
{
return uvm_parent_id_gpu_index(id);
}
static NvU32 uvm_id_gpu_index_from_global_gpu_id(const uvm_global_gpu_id_t id)
{
return uvm_parent_id_gpu_index_from_global_gpu_id(id);
}
static uvm_parent_gpu_id_t uvm_gpu_id_from_index(NvU32 index)
{
return uvm_parent_gpu_id_from_index(index);
}
static uvm_parent_gpu_id_t uvm_gpu_id_next(uvm_parent_gpu_id_t id)
{
return uvm_parent_gpu_id_next(id);
}
static uvm_parent_gpu_id_t uvm_gpu_id_from_global_gpu_id(const uvm_global_gpu_id_t id)
{
return uvm_parent_gpu_id_from_global_gpu_id(id);
}
static NvU32 uvm_global_id_gpu_index_from_gpu_id(const uvm_parent_gpu_id_t id)
{
return uvm_global_id_gpu_index_from_parent_gpu_id(id);
}
static uvm_global_gpu_id_t uvm_global_gpu_id_from_gpu_id(const uvm_parent_gpu_id_t id)
{
return uvm_global_gpu_id_from_parent_gpu_id(id);
}
#endif

View File

@@ -106,26 +106,6 @@ static NV_STATUS uvm_test_nv_kthread_q(UVM_TEST_NV_KTHREAD_Q_PARAMS *params, str
return NV_ERR_INVALID_STATE;
}
static NV_STATUS uvm_test_numa_get_closest_cpu_node_to_gpu(UVM_TEST_NUMA_GET_CLOSEST_CPU_NODE_TO_GPU_PARAMS *params,
struct file *filp)
{
uvm_gpu_t *gpu;
NV_STATUS status;
uvm_rm_user_object_t user_rm_va_space = {
.rm_control_fd = -1,
.user_client = params->client,
.user_object = params->smc_part_ref
};
status = uvm_gpu_retain_by_uuid(&params->gpu_uuid, &user_rm_va_space, &gpu);
if (status != NV_OK)
return status;
params->node_id = gpu->parent->closest_cpu_numa_node;
uvm_gpu_release(gpu);
return NV_OK;
}
// Callers of this function should ensure that node is not NUMA_NO_NODE in order
// to avoid overrunning the kernel's node to cpumask map.
static NV_STATUS uvm_test_verify_bh_affinity(uvm_intr_handler_t *isr, int node)
@@ -307,8 +287,6 @@ long uvm_test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_DRAIN_REPLAYABLE_FAULTS, uvm_test_drain_replayable_faults);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PMA_GET_BATCH_SIZE, uvm_test_pma_get_batch_size);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PMM_QUERY_PMA_STATS, uvm_test_pmm_query_pma_stats);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_NUMA_GET_CLOSEST_CPU_NODE_TO_GPU,
uvm_test_numa_get_closest_cpu_node_to_gpu);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_NUMA_CHECK_AFFINITY, uvm_test_numa_check_affinity);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_VA_SPACE_ADD_DUMMY_THREAD_CONTEXTS,
uvm_test_va_space_add_dummy_thread_contexts);

View File

@@ -561,6 +561,22 @@ typedef struct
// user_pages_allocation_retry_force_count, but the injection point simulates
// driver metadata allocation failure.
//
// cpu_chunk_allocation_target_id and cpu_chunk_allocation_actual_id are used
// to control the NUMA node IDs for CPU chunk allocations, specifically for
// testing overlapping CPU chunk allocations.
//
// Currently, uvm_api_migrate() does not pass the preferred CPU NUMA node for
// managed memory, so it is not possible to request a specific node.
// cpu_chunk_allocation_target_id is used to request that the allocation be
// made on a specific node. On the other hand, cpu_chunk_allocation_actual_id
// is the node on which the allocation will actually be made.
//
// The two parameters can be used to force a CPU chunk allocation to overlap a
// previously allocated chunk.
//
// Please note that even when specifying cpu_chunk_allocation_actual_id, the
// kernel may end up allocating on a different node.
//
// Error returns:
// NV_ERR_INVALID_ADDRESS
// - lookup_address doesn't match a UVM range
@@ -571,6 +587,8 @@ typedef struct
NvU32 page_table_allocation_retry_force_count; // In
NvU32 user_pages_allocation_retry_force_count; // In
NvU32 cpu_chunk_allocation_size_mask; // In
NvS32 cpu_chunk_allocation_target_id; // In
NvS32 cpu_chunk_allocation_actual_id; // In
NvU32 cpu_pages_allocation_error_count; // In
NvBool eviction_error; // In
NvBool populate_error; // In
@@ -604,6 +622,10 @@ typedef struct
NvProcessorUuid resident_on[UVM_MAX_PROCESSORS]; // Out
NvU32 resident_on_count; // Out
// If the memory is resident on the CPU, the NUMA node on which the page
// is resident. Otherwise, -1.
NvS32 resident_nid; // Out
// The size of the physical allocation backing lookup_address. Only the
// system-page-sized portion of this allocation which contains
// lookup_address is guaranteed to be resident on the corresponding
@@ -1168,19 +1190,6 @@ typedef struct
NV_STATUS rmStatus; // Out
} UVM_TEST_PMM_QUERY_PMA_STATS_PARAMS;
#define UVM_TEST_NUMA_GET_CLOSEST_CPU_NODE_TO_GPU UVM_TEST_IOCTL_BASE(77)
typedef struct
{
NvProcessorUuid gpu_uuid; // In
NvHandle client; // In
NvHandle smc_part_ref; // In
// On kernels with NUMA support, this entry contains the closest CPU NUMA
// node to this GPU. Otherwise, the value will be -1.
NvS32 node_id; // Out
NV_STATUS rmStatus; // Out
} UVM_TEST_NUMA_GET_CLOSEST_CPU_NODE_TO_GPU_PARAMS;
// Test whether the bottom halves have run on the correct CPUs based on the
// NUMA node locality of the GPU.
//

File diff suppressed because it is too large

View File

@@ -44,6 +44,7 @@
#include <linux/mmu_notifier.h>
#include <linux/wait.h>
#include <linux/nodemask.h>
// VA blocks are the leaf nodes in the uvm_va_space tree for managed allocations
// (VA ranges with type == UVM_VA_RANGE_TYPE_MANAGED):
@@ -229,6 +230,42 @@ typedef struct
} uvm_va_block_gpu_state_t;
typedef struct
{
// Per-page residency bit vector, used for fast traversal of resident
// pages.
//
// A set bit means the CPU has a coherent copy of the physical page
// resident in the NUMA node's memory, and that a CPU chunk for the
// corresponding page index has been allocated. This does not mean that
// the coherent copy is currently mapped anywhere, however. A page may be
// resident on multiple processors (but not multiple CPU NUMA nodes) when in
// read-duplicate mode.
//
// A cleared bit means the CPU NUMA node does not have a coherent copy of
// that page resident. A CPU chunk for the corresponding page index may or
// may not have been allocated. If the chunk is present, it's a cached chunk
// which can be reused in the future.
//
// Allocating PAGES_PER_UVM_VA_BLOCK is overkill when the block is
// smaller than UVM_VA_BLOCK_SIZE, but it's not much extra memory
// overhead on the whole.
uvm_page_mask_t resident;
// Per-page allocation bit vector.
//
// A set bit means that a CPU chunk has been allocated for the
// corresponding page index on this NUMA node.
uvm_page_mask_t allocated;
// CPU memory chunks represent physically contiguous CPU memory
// allocations. See uvm_pmm_sysmem.h for more details on CPU chunks.
// This member is meant to hold an opaque value indicating the CPU
// chunk storage method. For more details on CPU chunk storage,
// see uvm_cpu_chunk_storage_type_t in uvm_va_block.c.
unsigned long chunks;
} uvm_va_block_cpu_node_state_t;
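// Editorial illustration, not part of this commit: a sketch of how per-node
// CPU state could be looked up for a block. The cpu.node_state member added
// further below and a node_to_index() helper like the one referenced elsewhere
// in this change are assumed; this is not the driver's accessor.
static uvm_va_block_cpu_node_state_t *example_node_state_get(uvm_va_block_t *block, int nid)
{
    // One entry exists per possible NUMA node; NUMA_NO_NODE has no entry.
    UVM_ASSERT(nid != NUMA_NO_NODE);
    return block->cpu.node_state[node_to_index(nid)];
}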
// TODO: Bug 1766180: Worst-case we could have one of these per system page.
// Options:
// 1) Rely on the OOM killer to prevent the user from trying to do that
@@ -306,38 +343,30 @@ struct uvm_va_block_struct
struct
{
// Per-page residency bit vector, used for fast traversal of resident
// pages.
//
// A set bit means the CPU has a coherent copy of the physical page
// resident in its memory, and that the corresponding entry in the pages
// array is present. This does not mean that the coherent copy is
// currently mapped anywhere, however. A page may be resident on
// multiple processors when in read-duplicate mode.
//
// A cleared bit means the CPU does not have a coherent copy of that
// page resident. The corresponding entry in the pages array may or may
// not present. If the entry is present, it's a cached page which can be
// reused in the future.
//
// Allocating PAGES_PER_UVM_VA_BLOCK is overkill when the block is
// smaller than UVM_VA_BLOCK_SIZE, but it's not much extra memory
// overhead on the whole.
uvm_page_mask_t resident;
// CPU memory chunks represent physically contiguous CPU memory
// allocations. See uvm_pmm_sysmem.h for more details on CPU chunks.
// This member is meant to hold an opaque value indicating the CPU
// chunk storage method. For more details on CPU chunk storage,
// see uvm_cpu_chunk_storage_type_t in uvm_va_block.c.
unsigned long chunks;
// Per-NUMA node tracking of CPU allocations.
// This is a dense array with one entry per possible NUMA node.
uvm_va_block_cpu_node_state_t **node_state;
// Per-page allocation bit vector.
//
// A set bit means that a CPU page has been allocated for the
// corresponding page index.
// corresponding page index on at least one CPU NUMA node.
uvm_page_mask_t allocated;
// Per-page residency bit vector. See
// uvm_va_block_cpu_numa_state_t::resident for a detailed description.
// This mask is a cumulative mask (logical OR) of all
// uvm_va_block_cpu_node_state_t::resident masks. It is meant to be used
// only for fast testing of page residency when it matters only if the
// page is resident on the CPU.
//
// Note that this mask cannot be set directly as this will cause
// inconsistencies between this mask and the per-NUMA residency masks.
// In order to properly maintain consistency between the per-NUMA masks
// and this one, uvm_va_block_cpu_[set|clear]_residency_*() helpers
// should be used.
uvm_page_mask_t resident;
// Per-page mapping bit vectors, one per bit we need to track. These are
// used for fast traversal of valid mappings in the block. These contain
// all non-address bits needed to establish a virtual mapping on this
@@ -418,7 +447,8 @@ struct uvm_va_block_struct
uvm_page_mask_t read_duplicated_pages;
// Mask to keep track of the pages that are not mapped on any non-UVM-Lite
// processor.
// processor. This mask is not used for HMM because the CPU can map pages
// at any time without notifying the driver.
// 0: Page is definitely not mapped by any processors
// 1: Page may or may not be mapped by a processor
//
@@ -525,6 +555,13 @@ struct uvm_va_block_wrapper_struct
// a successful migration if this error flag is cleared.
NvU32 inject_cpu_pages_allocation_error_count;
// The NUMA node ID from which any CPU chunks will be allocated.
// This will override any other setting and/or policy.
// Note that the kernel is still free to allocate from any of the
// nodes in the thread's policy.
int cpu_chunk_allocation_target_id;
int cpu_chunk_allocation_actual_id;
// Force the next eviction attempt on this block to fail. Used for
// testing only.
bool inject_eviction_error;
@@ -668,17 +705,12 @@ void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context);
// Initialization of an already-allocated uvm_va_block_context_t.
//
// mm is used to initialize the value of va_block_context->mm. NULL is allowed.
static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm)
{
UVM_ASSERT(va_block_context);
void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm);
// Write garbage into the VA Block context to ensure that the UVM code
// clears masks appropriately
if (UVM_IS_DEBUG())
memset(va_block_context, 0xff, sizeof(*va_block_context));
va_block_context->mm = mm;
}
// Return the preferred NUMA node ID for the block's policy.
// If the preferred node ID is NUMA_NO_NODE, the current NUMA node ID
// is returned.
int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context);
// TODO: Bug 1766480: Using only page masks instead of a combination of regions
// and page masks could simplify the below APIs and their implementations
@@ -734,6 +766,9 @@ static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context,
// those masks. It is the caller's responsibility to zero the masks or
// not first.
//
// va_block_context->make_resident.dest_nid is used to guide the NUMA node for
// CPU allocations.
//
// Notably any status other than NV_OK indicates that the block's lock might
// have been unlocked and relocked.
//
@@ -1377,8 +1412,14 @@ static uvm_va_block_test_t *uvm_va_block_get_test(uvm_va_block_t *va_block)
// Get the page residency mask for a processor if it's known to be there.
//
// If the processor is the CPU, the residency mask for the NUMA node ID
// specified by nid will be returned (see
// uvm_va_block_cpu_node_state_t::resident). If nid is NUMA_NO_NODE,
// the cumulative CPU residency mask will be returned (see
// uvm_va_block_t::cpu::resident).
//
// If the processor is a GPU, this will assert that GPU state is indeed present.
uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor);
uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor, int nid);
// Get the page mapped mask for a processor. The returned mask cannot be
// directly modified by the caller
@@ -1386,6 +1427,13 @@ uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_proce
// If the processor is a GPU, this will assert that GPU state is indeed present.
const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor);
// Return a mask of non-UVM-Lite pages that are unmapped within the given
// region.
// Locking: The block lock must be held.
void uvm_va_block_unmapped_pages_get(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
uvm_page_mask_t *out_mask);
// VA block lookup functions. There are a number of permutations which might be
// useful, such as looking up the block from {va_space, va_range} x {addr,
// block index}. The ones implemented here and in uvm_va_range.h support the
@@ -1756,17 +1804,28 @@ static bool uvm_page_mask_full(const uvm_page_mask_t *mask)
return bitmap_full(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
static bool uvm_page_mask_and(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
static void uvm_page_mask_fill(uvm_page_mask_t *mask)
{
bitmap_fill(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
static bool uvm_page_mask_and(uvm_page_mask_t *mask_out,
const uvm_page_mask_t *mask_in1,
const uvm_page_mask_t *mask_in2)
{
return bitmap_and(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
static bool uvm_page_mask_andnot(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
static bool uvm_page_mask_andnot(uvm_page_mask_t *mask_out,
const uvm_page_mask_t *mask_in1,
const uvm_page_mask_t *mask_in2)
{
return bitmap_andnot(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
static void uvm_page_mask_or(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
static void uvm_page_mask_or(uvm_page_mask_t *mask_out,
const uvm_page_mask_t *mask_in1,
const uvm_page_mask_t *mask_in2)
{
bitmap_or(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
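// Editorial illustration, not part of this commit: a small sketch of how the
// mask helpers above compose. The function and mask names are hypothetical.
static bool example_resident_but_unmapped(uvm_page_mask_t *out,
                                          const uvm_page_mask_t *resident,
                                          const uvm_page_mask_t *mapped)
{
    // bitmap_andnot() semantics: returns true when the result is non-empty,
    // i.e. at least one resident page has no mapping recorded.
    return uvm_page_mask_andnot(out, resident, mapped);
}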
@@ -2036,30 +2095,49 @@ uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_blo
uvm_page_index_t page_index,
uvm_processor_id_t processor);
// Mark CPU page page_index as resident on NUMA node specified by nid.
// nid cannot be NUMA_NO_NODE.
void uvm_va_block_cpu_set_resident_page(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index);
// Test if a CPU page is resident on NUMA node nid. If nid is NUMA_NO_NODE,
// the function will return True if the page is resident on any CPU NUMA node.
bool uvm_va_block_cpu_is_page_resident_on(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index);
// Test if all pages in region are resident on NUMA node nid. If nid is
// NUMA_NO_NODE, the function will test if the pages in the region are
// resident on any CPU NUMA node.
bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, uvm_va_block_region_t region);
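// Editorial illustration, not part of this commit: usage sketch for the
// residency queries declared above. NUMA_NO_NODE checks the cumulative CPU
// residency mask, while a specific nid checks only that node's mask.
static bool example_page_resident_on_any_cpu_node(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    return uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index);
}

static bool example_page_resident_on_cpu_node(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
{
    UVM_ASSERT(nid != NUMA_NO_NODE);
    return uvm_va_block_cpu_is_page_resident_on(va_block, nid, page_index);
}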
// Insert a CPU chunk at the given page_index into the va_block.
// Locking: The va_block lock must be held.
NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_page_index_t page_index);
NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
// Remove a CPU chunk at the given page_index from the va_block.
// nid cannot be NUMA_NO_NODE.
// Locking: The va_block lock must be held.
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
uvm_page_index_t page_index);
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index);
// Return the CPU chunk at the given page_index from the va_block.
// Return the CPU chunk at the given page_index on the given NUMA node from the
// va_block. nid cannot be NUMA_NO_NODE.
// Locking: The va_block lock must be held.
uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block,
int nid,
uvm_page_index_t page_index);
// Return the CPU chunk at the given page_index from the va_block.
// Return the struct page * from the chunk corresponding to the given page_index
// Locking: The va_block lock must be held.
struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block,
uvm_page_index_t page_index);
struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
// Return the struct page * of the resident chunk at the given page_index from
// the va_block. The given page_index must be resident on the CPU.
// Locking: The va_block lock must be held.
struct page *uvm_va_block_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index);
// Physically map a CPU chunk so it is DMA'able from all registered GPUs.
// nid cannot be NUMA_NO_NODE.
// Locking: The va_block lock must be held.
NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_page_index_t page_index);
// Physically unmap a CPU chunk from all registered GPUs.

View File

@@ -30,6 +30,7 @@
#include "uvm_forward_decl.h"
#include <linux/migrate.h>
#include <linux/nodemask.h>
// UVM_VA_BLOCK_BITS is 21, meaning the maximum block size is 2MB. Rationale:
// - 2MB matches the largest Pascal GPU page size so it's a natural fit
@@ -145,6 +146,18 @@ typedef struct
unsigned count;
} uvm_prot_page_mask_array_t[UVM_PROT_MAX - 1];
typedef struct
{
// A per-NUMA-node array of page masks (size num_possible_nodes()) that hold
// the set of CPU pages used by the migration operation.
uvm_page_mask_t **node_masks;
// Node mask used to iterate over the page masks above.
// If a node's bit is set, it means that the page mask at index
// node_to_index() in node_masks has pages set.
nodemask_t nodes;
} uvm_make_resident_page_tracking_t;
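// Editorial illustration, not part of this commit: a sketch of walking the
// tracking structure above. node_to_index() is the helper its comment refers
// to and is assumed here; for_each_node_mask() is the kernel's nodemask
// iterator.
static void example_visit_tracked_cpu_pages(uvm_make_resident_page_tracking_t *tracking)
{
    int nid;

    for_each_node_mask(nid, tracking->nodes) {
        uvm_page_mask_t *pages = tracking->node_masks[node_to_index(nid)];

        // 'pages' is the set of CPU pages this operation used on NUMA node nid.
        (void)pages;
    }
}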
// In the worst case some VA block operations require more state than we should
// reasonably store on the stack. Instead, we dynamically allocate VA block
// contexts. These are used for almost all operations on VA blocks.
@@ -159,6 +172,9 @@ typedef struct
// this block_context.
uvm_page_mask_t scratch_page_mask;
// Scratch node mask. This follows the same rules as scratch_page_mask.
nodemask_t scratch_node_mask;
// State used by uvm_va_block_make_resident
struct uvm_make_resident_context_struct
{
@@ -181,10 +197,24 @@ typedef struct
// Used to perform ECC checks after the migration is done.
uvm_processor_mask_t all_involved_processors;
// Page mask used to compute the set of CPU pages for each CPU node.
uvm_page_mask_t node_pages_mask;
// Final residency for the data. This is useful for callees to know if
// a migration is part of a staging copy
uvm_processor_id_t dest_id;
// Final residency NUMA node if the migration destination is the CPU.
int dest_nid;
// This structure is used to track CPU pages used for migrations on
// a per-NUMA node basis.
//
// The pages could be used for either migrations to the CPU (used to
// track the destination CPU pages) or staging copies (used to track
// the CPU pages used for the staging).
uvm_make_resident_page_tracking_t cpu_pages_used;
// Event that triggered the call
uvm_make_resident_cause_t cause;
} make_resident;

View File

@@ -31,6 +31,7 @@
const uvm_va_policy_t uvm_va_policy_default = {
.preferred_location = UVM_ID_INVALID,
.preferred_nid = NUMA_NO_NODE,
.read_duplication = UVM_READ_DUPLICATION_UNSET,
};

View File

@@ -24,6 +24,7 @@
#ifndef __UVM_VA_POLICY_H__
#define __UVM_VA_POLICY_H__
#include <linux/numa.h>
#include "uvm_linux.h"
#include "uvm_forward_decl.h"
#include "uvm_processors.h"
@@ -62,6 +63,18 @@ struct uvm_va_policy_struct
// This is set to UVM_ID_INVALID if no preferred location is set.
uvm_processor_id_t preferred_location;
// If the preferred location is the CPU, this is either the preferred NUMA
// node ID or NUMA_NO_NODE to indicate that there is no preference among
// nodes.
// If preferred_location is a GPU, preferred_nid will be used if CPU
// pages have to be allocated for any staging copies. Otherwise, it is
// not used.
//
// TODO: Bug 4148100 - Preferred_location and preferred_nid should be
// combined into a new type that combines the processor and NUMA node
// ID.
int preferred_nid;
// Mask of processors that are accessing this VA range and should have
// their page tables updated to access the (possibly remote) pages.
uvm_processor_mask_t accessed_by;
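// Editorial illustration, not part of this commit: a minimal sketch of how a
// CPU allocation path might interpret preferred_location and preferred_nid
// together. The function name is hypothetical and this is not the driver's
// logic.
static int example_preferred_cpu_node(const uvm_va_policy_t *policy)
{
    // No preferred location means no node preference either.
    if (UVM_ID_IS_INVALID(policy->preferred_location))
        return NUMA_NO_NODE;

    // preferred_nid applies both when the CPU is the preferred location and
    // when CPU pages are needed for staging copies to a preferred GPU;
    // NUMA_NO_NODE means no preference among CPU nodes.
    return policy->preferred_nid;
}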

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -162,9 +162,7 @@ static uvm_va_range_t *uvm_va_range_alloc_managed(uvm_va_space_t *va_space, NvU6
goto error;
va_range->type = UVM_VA_RANGE_TYPE_MANAGED;
uvm_va_range_get_policy(va_range)->read_duplication = UVM_READ_DUPLICATION_UNSET;
uvm_va_range_get_policy(va_range)->preferred_location = UVM_ID_INVALID;
va_range->managed.policy = uvm_va_policy_default;
va_range->blocks = uvm_kvmalloc_zero(uvm_va_range_num_blocks(va_range) * sizeof(va_range->blocks[0]));
if (!va_range->blocks) {
@@ -376,7 +374,7 @@ NV_STATUS uvm_va_range_create_semaphore_pool(uvm_va_space_t *va_space,
if (status != NV_OK)
goto error;
if (i == 0 && g_uvm_global.conf_computing_enabled)
if (i == 0 && g_uvm_global.sev_enabled)
mem_alloc_params.dma_owner = gpu;
if (attrs.is_cacheable) {
@@ -835,7 +833,7 @@ static void uvm_va_range_disable_peer_external(uvm_va_range_t *va_range,
range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
uvm_mutex_lock(&range_tree->lock);
uvm_ext_gpu_map_for_each_safe(ext_map, ext_map_next, va_range, mapping_gpu) {
if (ext_map->owning_gpu == owning_gpu && !ext_map->is_sysmem) {
if (ext_map->owning_gpu == owning_gpu && (!ext_map->is_sysmem || ext_map->is_egm)) {
UVM_ASSERT(deferred_free_list);
uvm_ext_gpu_map_destroy(va_range, ext_map, deferred_free_list);
}
@@ -1807,7 +1805,7 @@ NV_STATUS uvm_api_alloc_semaphore_pool(UVM_ALLOC_SEMAPHORE_POOL_PARAMS *params,
if (params->gpuAttributesCount > UVM_MAX_GPUS)
return NV_ERR_INVALID_ARGUMENT;
if (g_uvm_global.conf_computing_enabled && params->gpuAttributesCount == 0)
if (g_uvm_global.sev_enabled && params->gpuAttributesCount == 0)
return NV_ERR_INVALID_ARGUMENT;
// The mm needs to be locked in order to remove stale HMM va_blocks.

View File

@@ -189,6 +189,7 @@ typedef struct
// sysmem was originally allocated under. For the allocation to remain valid
// we need to prevent the GPU from going away, similarly to P2P mapped
// memory.
// Similarly for EGM memory.
//
// This field is not used for sparse mappings as they don't have an
// allocation and, hence, owning GPU.
@@ -208,6 +209,9 @@ typedef struct
// backing.
bool is_sysmem;
// EGM memory. If true, is_sysmem also has to be true and owning_gpu
// has to be valid.
bool is_egm;
// GPU page tables mapping the allocation
uvm_page_table_range_vec_t pt_range_vec;

View File

@@ -222,6 +222,12 @@ NV_STATUS uvm_va_space_create(struct address_space *mapping, uvm_va_space_t **va
uvm_down_write_mmap_lock(current->mm);
uvm_va_space_down_write(va_space);
va_space->va_block_context = uvm_va_block_context_alloc(NULL);
if (!va_space->va_block_context) {
status = NV_ERR_NO_MEMORY;
goto fail;
}
status = uvm_perf_init_va_space_events(va_space, &va_space->perf_events);
if (status != NV_OK)
goto fail;
@@ -258,6 +264,7 @@ NV_STATUS uvm_va_space_create(struct address_space *mapping, uvm_va_space_t **va
fail:
uvm_perf_heuristics_unload(va_space);
uvm_perf_destroy_va_space_events(&va_space->perf_events);
uvm_va_block_context_free(va_space->va_block_context);
uvm_va_space_up_write(va_space);
uvm_up_write_mmap_lock(current->mm);
@@ -457,8 +464,6 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space)
uvm_va_range_destroy(va_range, &deferred_free_list);
}
uvm_hmm_va_space_destroy(va_space);
uvm_range_group_radix_tree_destroy(va_space);
// Unregister all GPUs in the VA space. Note that this does not release the
@@ -466,11 +471,17 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space)
for_each_va_space_gpu(gpu, va_space)
unregister_gpu(va_space, gpu, NULL, &deferred_free_list, NULL);
uvm_hmm_va_space_destroy(va_space);
uvm_perf_heuristics_unload(va_space);
uvm_perf_destroy_va_space_events(&va_space->perf_events);
va_space_remove_dummy_thread_contexts(va_space);
// Destroy the VA space's block context node tracking after all ranges have
// been destroyed as the VA blocks may reference it.
uvm_va_block_context_free(va_space->va_block_context);
uvm_va_space_up_write(va_space);
UVM_ASSERT(uvm_processor_mask_empty(&va_space->registered_gpus));
@@ -688,7 +699,7 @@ NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
// Mixing coherent and non-coherent GPUs is not supported
for_each_va_space_gpu(other_gpu, va_space) {
if (uvm_gpu_is_coherent(gpu->parent) != uvm_gpu_is_coherent(other_gpu->parent)) {
if (uvm_parent_gpu_is_coherent(gpu->parent) != uvm_parent_gpu_is_coherent(other_gpu->parent)) {
status = NV_ERR_INVALID_DEVICE;
goto done;
}
@@ -729,7 +740,7 @@ NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
processor_mask_array_set(va_space->has_nvlink, UVM_ID_CPU, gpu->id);
}
if (uvm_gpu_is_coherent(gpu->parent)) {
if (uvm_parent_gpu_is_coherent(gpu->parent)) {
processor_mask_array_set(va_space->has_native_atomics, gpu->id, UVM_ID_CPU);
if (gpu->mem_info.numa.enabled) {
@@ -1540,7 +1551,6 @@ static void remove_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space,
atomic_inc(&va_space->gpu_va_space_deferred_free.num_pending);
uvm_processor_mask_clear(&va_space->registered_gpu_va_spaces, gpu_va_space->gpu->id);
uvm_processor_mask_clear_atomic(&va_space->needs_fault_buffer_flush, gpu_va_space->gpu->id);
va_space->gpu_va_spaces[uvm_id_gpu_index(gpu_va_space->gpu->id)] = NULL;
gpu_va_space->state = UVM_GPU_VA_SPACE_STATE_DEAD;
}
@@ -1610,14 +1620,14 @@ NV_STATUS uvm_va_space_unregister_gpu_va_space(uvm_va_space_t *va_space, const N
return status;
}
bool uvm_va_space_peer_enabled(uvm_va_space_t *va_space, uvm_gpu_t *gpu1, uvm_gpu_t *gpu2)
bool uvm_va_space_peer_enabled(uvm_va_space_t *va_space, const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
size_t table_index;
UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu0->id));
UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu1->id));
UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu2->id));
table_index = uvm_gpu_peer_table_index(gpu1->id, gpu2->id);
table_index = uvm_gpu_peer_table_index(gpu0->id, gpu1->id);
return !!test_bit(table_index, va_space->enabled_peers);
}
@@ -2073,9 +2083,16 @@ NV_STATUS uvm_service_block_context_init(void)
// Pre-allocate some fault service contexts for the CPU and add them to the global list
while (num_preallocated_contexts-- > 0) {
uvm_service_block_context_t *service_context = uvm_kvmalloc(sizeof(*service_context));
if (!service_context)
return NV_ERR_NO_MEMORY;
service_context->block_context = uvm_va_block_context_alloc(NULL);
if (!service_context->block_context) {
uvm_kvfree(service_context);
return NV_ERR_NO_MEMORY;
}
list_add(&service_context->cpu_fault.service_context_list, &g_cpu_service_block_context_list);
}
@@ -2089,6 +2106,7 @@ void uvm_service_block_context_exit(void)
// Free fault service contexts for the CPU and add clear the global list
list_for_each_entry_safe(service_context, service_context_tmp, &g_cpu_service_block_context_list,
cpu_fault.service_context_list) {
uvm_va_block_context_free(service_context->block_context);
uvm_kvfree(service_context);
}
INIT_LIST_HEAD(&g_cpu_service_block_context_list);
@@ -2110,8 +2128,17 @@ static uvm_service_block_context_t *service_block_context_cpu_alloc(void)
uvm_spin_unlock(&g_cpu_service_block_context_list_lock);
if (!service_context)
if (!service_context) {
service_context = uvm_kvmalloc(sizeof(*service_context));
if (!service_context)
    return NULL;
service_context->block_context = uvm_va_block_context_alloc(NULL);
if (!service_context->block_context) {
uvm_kvfree(service_context);
service_context = NULL;
}
}
else {
uvm_va_block_context_init(service_context->block_context, NULL);
}
return service_context;
}
@@ -2137,6 +2164,7 @@ static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
NV_STATUS status = uvm_global_get_status();
bool tools_enabled;
bool major_fault = false;
bool is_remote_mm = false;
uvm_service_block_context_t *service_context;
uvm_global_processor_mask_t gpus_to_check_for_ecc;
@@ -2177,7 +2205,7 @@ static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
// mmap_lock held on the CPU fault path, so tell the fault handler to use
// that one. current->mm might differ if we're on the access_process_vm
// (ptrace) path or if another driver is calling get_user_pages.
service_context->block_context.mm = vma->vm_mm;
service_context->block_context->mm = vma->vm_mm;
// The mmap_lock might be held in write mode, but the mode doesn't matter
// for the purpose of lock ordering and we don't rely on it being in write
@@ -2216,25 +2244,32 @@ static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
uvm_tools_record_throttling_end(va_space, fault_addr, UVM_ID_CPU);
if (is_hmm) {
// Note that normally we should find a va_block for the faulting
// address because the block had to be created when migrating a
// page to the GPU and a device private PTE inserted into the CPU
// page tables in order for migrate_to_ram() to be called. Not
// finding it means the PTE was remapped to a different virtual
// address with mremap() so create a new va_block if needed.
status = uvm_hmm_va_block_find_create(va_space,
fault_addr,
&service_context->block_context.hmm.vma,
&va_block);
if (status != NV_OK)
break;
if (va_space->va_space_mm.mm == vma->vm_mm) {
// Note that normally we should find a va_block for the faulting
// address because the block had to be created when migrating a
// page to the GPU and a device private PTE inserted into the CPU
// page tables in order for migrate_to_ram() to be called. Not
// finding it means the PTE was remapped to a different virtual
// address with mremap() so create a new va_block if needed.
status = uvm_hmm_va_block_find_create(va_space,
fault_addr,
&service_context->block_context->hmm.vma,
&va_block);
if (status != NV_OK)
break;
UVM_ASSERT(service_context->block_context.hmm.vma == vma);
status = uvm_hmm_migrate_begin(va_block);
if (status != NV_OK)
break;
UVM_ASSERT(service_context->block_context->hmm.vma == vma);
status = uvm_hmm_migrate_begin(va_block);
if (status != NV_OK)
break;
service_context->cpu_fault.vmf = vmf;
service_context->cpu_fault.vmf = vmf;
}
else {
is_remote_mm = true;
status = uvm_hmm_remote_cpu_fault(vmf);
break;
}
}
else {
status = uvm_va_block_find_create_managed(va_space, fault_addr, &va_block);
@@ -2265,7 +2300,7 @@ static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
tools_enabled = va_space->tools.enabled;
if (status == NV_OK) {
if (status == NV_OK && !is_remote_mm) {
uvm_va_space_global_gpus_in_mask(va_space,
&gpus_to_check_for_ecc,
&service_context->cpu_fault.gpus_to_check_for_ecc);
@@ -2275,7 +2310,7 @@ static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
uvm_va_space_up_read(va_space);
uvm_record_unlock_mmap_lock_read(vma->vm_mm);
if (status == NV_OK) {
if (status == NV_OK && !is_remote_mm) {
status = uvm_global_mask_check_ecc_error(&gpus_to_check_for_ecc);
uvm_global_mask_release(&gpus_to_check_for_ecc);
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -253,17 +253,6 @@ struct uvm_va_space_struct
// corrupting state.
uvm_processor_mask_t gpu_unregister_in_progress;
// On VMA destruction, the fault buffer needs to be flushed for all the GPUs
// registered in the VA space to avoid leaving stale entries of the VA range
// that is going to be destroyed. Otherwise, these fault entries can be
// attributed to new VA ranges reallocated at the same addresses. However,
// uvm_vm_close is called with mm->mmap_lock taken and we cannot take the
// ISR lock. Therefore, we use a flag to notify the GPU fault handler that
// the fault buffer needs to be flushed, before servicing the faults that
// belong to the va_space. The bits are set and cleared atomically so no
// va_space lock is required.
uvm_processor_mask_t needs_fault_buffer_flush;
// Mask of processors that are participating in system-wide atomics
uvm_processor_mask_t system_wide_atomics_enabled_processors;
@@ -335,7 +324,7 @@ struct uvm_va_space_struct
// Block context used for GPU unmap operations so that allocation is not
// required on the teardown path. This can only be used while the VA space
// lock is held in write mode. Access using uvm_va_space_block_context().
uvm_va_block_context_t va_block_context;
uvm_va_block_context_t *va_block_context;
NvU64 initialization_flags;
@@ -541,7 +530,7 @@ void uvm_va_space_detach_all_user_channels(uvm_va_space_t *va_space, struct list
// Returns whether peer access between these two GPUs has been enabled in this
// VA space. Both GPUs must be registered in the VA space.
bool uvm_va_space_peer_enabled(uvm_va_space_t *va_space, uvm_gpu_t *gpu1, uvm_gpu_t *gpu2);
bool uvm_va_space_peer_enabled(uvm_va_space_t *va_space, const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1);
// Returns the va_space this file points to. Returns NULL if this file
// does not point to a va_space.
@@ -575,8 +564,8 @@ static uvm_va_block_context_t *uvm_va_space_block_context(uvm_va_space_t *va_spa
if (mm)
uvm_assert_mmap_lock_locked(mm);
uvm_va_block_context_init(&va_space->va_block_context, mm);
return &va_space->va_block_context;
uvm_va_block_context_init(va_space->va_block_context, mm);
return va_space->va_block_context;
}
// Retains the GPU VA space memory object. destroy_gpu_va_space and

View File

@@ -215,7 +215,13 @@ bool uvm_va_space_mm_enabled(uvm_va_space_t *va_space)
static struct mmu_notifier_ops uvm_mmu_notifier_ops_ats =
{
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE)
.invalidate_range = uvm_mmu_notifier_invalidate_range_ats,
#elif defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
.arch_invalidate_secondary_tlbs = uvm_mmu_notifier_invalidate_range_ats,
#else
#error One of invalidate_range/arch_invalidate_secondary_tlbs must be present
#endif
};
static int uvm_mmu_notifier_register(uvm_va_space_mm_t *va_space_mm)
@@ -310,17 +316,6 @@ void uvm_va_space_mm_unregister(uvm_va_space_t *va_space)
if (!va_space_mm->mm)
return;
// At this point the mm is still valid because uvm_mm_release()
// hasn't yet called mmput(). uvm_hmm_va_space_destroy() will kill
// all the va_blocks along with any associated gpu_chunks, so we
// need to make sure these chunks are free. However freeing them
// requires a valid mm so we can call migrate_vma_setup(), so we
// do that here.
// TODO: Bug 3902536: [UVM-HMM] add code to migrate GPU memory
// without having a va_block
if (uvm_hmm_is_enabled(va_space))
uvm_hmm_evict_va_blocks(va_space);
if (uvm_va_space_mm_enabled(va_space)) {
if (UVM_ATS_IBM_SUPPORTED_IN_DRIVER() && g_uvm_global.ats.enabled)
uvm_mmu_notifier_unregister(va_space_mm);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2023 NVIDIA Corporation
Copyright (c) 2017-2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -145,10 +145,7 @@ static NvU64 small_half_pde_volta(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_volta(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
static void make_pde_volta(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
{
NvU32 entry_count = entries_per_index_volta(depth);
NvU64 *entry_bits = (NvU64 *)entry;