535.43.24

russellcnv
2024-01-31 14:02:06 -08:00
parent 2a3b58b8c8
commit e558660fc2
267 changed files with 89045 additions and 82824 deletions

View File

@@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
EXTRA_CFLAGS += -I$(src)
EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.43.23\"
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.43.24\"
ifneq ($(SYSSRCHOST1X),)
EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
@@ -123,6 +123,9 @@ ifneq ($(wildcard /proc/sgi_uv),)
EXTRA_CFLAGS += -DNV_CONFIG_X86_UV
endif
ifdef VGX_FORCE_VFIO_PCI_CORE
EXTRA_CFLAGS += -DNV_VGPU_FORCE_VFIO_PCI_CORE
endif
#
# The conftest.sh script tests various aspects of the target kernel.

View File

@@ -2067,4 +2067,7 @@ typedef enum
#include <linux/clk-provider.h>
#endif
#define NV_EXPORT_SYMBOL(symbol) EXPORT_SYMBOL_GPL(symbol)
#define NV_CHECK_EXPORT_SYMBOL(symbol) NV_IS_EXPORT_SYMBOL_PRESENT_##symbol
#endif /* _NV_LINUX_H_ */
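A minimal, self-contained sketch (the symbol name is illustrative, not taken from this change) of how the new NV_CHECK_EXPORT_SYMBOL macro composes with the conftest output shown later in this commit: conftest.sh emits NV_IS_EXPORT_SYMBOL_PRESENT_<symbol> definitions, and the ## paste turns a symbol name into that generated macro so presence can be tested in preprocessor conditionals.

/* Hypothetical conftest output for a symbol found in Module.symvers. */
#define NV_IS_EXPORT_SYMBOL_PRESENT_migrate_vma_setup 1

/* Macro as added above. */
#define NV_CHECK_EXPORT_SYMBOL(symbol) NV_IS_EXPORT_SYMBOL_PRESENT_##symbol

/* Expands to NV_IS_EXPORT_SYMBOL_PRESENT_migrate_vma_setup, i.e. 1. */
#if NV_CHECK_EXPORT_SYMBOL(migrate_vma_setup)
int have_migrate_vma_setup = 1;
#else
int have_migrate_vma_setup = 0;
#endif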

View File

@@ -924,6 +924,7 @@ NV_STATUS NV_API_CALL rm_ioctl (nvidia_stack_t *, nv_state_t *
NvBool NV_API_CALL rm_isr (nvidia_stack_t *, nv_state_t *, NvU32 *);
void NV_API_CALL rm_isr_bh (nvidia_stack_t *, nv_state_t *);
void NV_API_CALL rm_isr_bh_unlocked (nvidia_stack_t *, nv_state_t *);
NvBool NV_API_CALL rm_is_msix_allowed (nvidia_stack_t *, nv_state_t *);
NV_STATUS NV_API_CALL rm_power_management (nvidia_stack_t *, nv_state_t *, nv_pm_action_t);
NV_STATUS NV_API_CALL rm_stop_user_channels (nvidia_stack_t *, nv_state_t *);
NV_STATUS NV_API_CALL rm_restart_user_channels (nvidia_stack_t *, nv_state_t *);

View File

@@ -207,9 +207,13 @@ enum os_pci_req_atomics_type {
OS_INTF_PCIE_REQ_ATOMICS_128BIT
};
NV_STATUS NV_API_CALL os_enable_pci_req_atomics (void *, enum os_pci_req_atomics_type);
NV_STATUS NV_API_CALL os_get_numa_node_memory_usage (NvS32, NvU64 *, NvU64 *);
NV_STATUS NV_API_CALL os_numa_add_gpu_memory (void *, NvU64, NvU64, NvU32 *);
NV_STATUS NV_API_CALL os_numa_remove_gpu_memory (void *, NvU64, NvU64, NvU32);
NV_STATUS NV_API_CALL os_offline_page_at_address(NvU64 address);
void* NV_API_CALL os_get_pid_info(void);
void NV_API_CALL os_put_pid_info(void *pid_info);
NV_STATUS NV_API_CALL os_find_ns_pid(void *pid_info, NvU32 *ns_pid);
extern NvU32 os_page_size;
extern NvU64 os_page_mask;

View File

@@ -316,7 +316,7 @@ export_symbol_present_conftest() {
SYMBOL="$1"
TAB=' '
if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_SYMBOL.*\$" \
if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_SYMBOL\(_GPL\)\?\s*\$" \
"$OUTPUT/Module.symvers" >/dev/null 2>&1; then
echo "#define NV_IS_EXPORT_SYMBOL_PRESENT_$SYMBOL 1" |
append_conftest "symbols"
@@ -337,7 +337,7 @@ export_symbol_gpl_conftest() {
SYMBOL="$1"
TAB=' '
if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_\(UNUSED_\)*SYMBOL_GPL\$" \
if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_\(UNUSED_\)*SYMBOL_GPL\s*\$" \
"$OUTPUT/Module.symvers" >/dev/null 2>&1; then
echo "#define NV_IS_EXPORT_SYMBOL_GPL_$SYMBOL 1" |
append_conftest "symbols"
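# Illustrative check of the relaxed patterns above (the symbol and paths are
# made up; assumes GNU grep, which supports \? and \s in basic regular
# expressions, as the new patterns already do). Newer kernels append a
# namespace column after the export type in Module.symvers, so entries can
# carry trailing whitespace that the old, strictly anchored patterns rejected:
TAB="$(printf '\t')"
printf '0x12345678\tfoo_symbol\tdrivers/foo/foo\tEXPORT_SYMBOL_GPL\t\n' > /tmp/Module.symvers
if grep -e "${TAB}foo_symbol${TAB}.*${TAB}EXPORT_SYMBOL\(_GPL\)\?\s*\$" \
    /tmp/Module.symvers >/dev/null 2>&1; then
    echo "foo_symbol is exported (EXPORT_SYMBOL or EXPORT_SYMBOL_GPL)"
fi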
@@ -4468,6 +4468,24 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE" "" "types"
;;
mmu_notifier_ops_arch_invalidate_secondary_tlbs)
#
# Determine if the mmu_notifier_ops struct has the
# 'arch_invalidate_secondary_tlbs' member.
#
# struct mmu_notifier_ops.invalidate_range was renamed to
# arch_invalidate_secondary_tlbs by commit 1af5a8109904
# ("mmu_notifiers: rename invalidate_range notifier") due to be
# added in v6.6
CODE="
#include <linux/mmu_notifier.h>
int conftest_mmu_notifier_ops_arch_invalidate_secondary_tlbs(void) {
return offsetof(struct mmu_notifier_ops, arch_invalidate_secondary_tlbs);
}"
compile_check_conftest "$CODE" "NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS" "" "types"
;;
drm_format_num_planes)
#
# Determine if drm_format_num_planes() function is present.
@@ -5636,23 +5654,6 @@ compile_test() {
compile_check_conftest "$CODE" "NV_GPIO_TO_IRQ_PRESENT" "" "functions"
;;
migrate_vma_setup)
#
# Determine if migrate_vma_setup() function is present
#
# migrate_vma_setup() function was added by commit
# a7d1f22bb74f32cf3cd93f52776007e161f1a738 ("mm: turn migrate_vma
# upside down) in v5.4.
# (2019-08-20).
CODE="
#include <linux/migrate.h>
int conftest_migrate_vma_setup(void) {
migrate_vma_setup();
}"
compile_check_conftest "$CODE" "NV_MIGRATE_VMA_SETUP_PRESENT" "" "functions"
;;
migrate_vma_added_flags)
#
# Determine if migrate_vma structure has flags
@@ -5743,23 +5744,25 @@ compile_test() {
compile_check_conftest "$CODE" "NV_IOASID_GET_PRESENT" "" "functions"
;;
mm_pasid_set)
mm_pasid_drop)
#
# Determine if mm_pasid_set() function is present
# Determine if mm_pasid_drop() function is present
#
# Added by commit 701fac40384f ("iommu/sva: Assign a PASID to mm
# on PASID allocation and free it on mm exit") in v5.18.
# Moved to linux/iommu.h in commit cd3891158a77 ("iommu/sva: Move
# PASID helpers to sva code") in v6.4.
#
# mm_pasid_set() function was added by commit
# 701fac40384f07197b106136012804c3cae0b3de (iommu/sva: Assign a
# PASID to mm on PASID allocation and free it on mm exit) in v5.18.
# (2022-02-15).
CODE="
#if defined(NV_LINUX_SCHED_MM_H_PRESENT)
#include <linux/sched/mm.h>
#endif
void conftest_mm_pasid_set(void) {
mm_pasid_set();
#include <linux/iommu.h>
void conftest_mm_pasid_drop(void) {
mm_pasid_drop();
}"
compile_check_conftest "$CODE" "NV_MM_PASID_SET_PRESENT" "" "functions"
compile_check_conftest "$CODE" "NV_MM_PASID_DROP_PRESENT" "" "functions"
;;
drm_crtc_state_has_no_vblank)
@@ -6279,6 +6282,21 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MEMORY_FAILURE_MF_SW_SIMULATED_DEFINED" "" "types"
;;
crypto_tfm_ctx_aligned)
# Determine if 'crypto_tfm_ctx_aligned' is defined.
#
# Removed by commit 25c74a39e0f6 ("crypto: hmac - remove unnecessary
# alignment logic") in v6.7.
#
CODE="
#include <crypto/algapi.h>
void conftest_crypto_tfm_ctx_aligned(void) {
(void)crypto_tfm_ctx_aligned();
}"
compile_check_conftest "$CODE" "NV_CRYPTO_TFM_CTX_ALIGNED_PRESENT" "" "functions"
;;
crypto)
#
# Determine if we support various crypto functions.
@@ -6341,6 +6359,22 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MEMPOLICY_HAS_HOME_NODE" "" "types"
;;
mpol_preferred_many_present)
#
# Determine if MPOL_PREFERRED_MANY enum is present or not
#
# Added by commit b27abaccf8e8b ("mm/mempolicy: add
# MPOL_PREFERRED_MANY for multiple preferred nodes") in
# v5.15
#
CODE="
#include <linux/mempolicy.h>
int mpol_preferred_many = MPOL_PREFERRED_MANY;
"
compile_check_conftest "$CODE" "NV_MPOL_PREFERRED_MANY_PRESENT" "" "types"
;;
mmu_interval_notifier)
#
# Determine if mmu_interval_notifier struct is present or not
@@ -6356,6 +6390,21 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MMU_INTERVAL_NOTIFIER" "" "types"
;;
drm_unlocked_ioctl_flag_present)
# Determine if DRM_UNLOCKED IOCTL flag is present.
#
# DRM_UNLOCKED was removed by commit 2798ffcc1d6a ("drm: Remove
# locking for legacy ioctls and DRM_UNLOCKED") in Linux
# next-20231208.
CODE="
#if defined(NV_DRM_DRM_IOCTL_H_PRESENT)
#include <drm/drm_ioctl.h>
#endif
int flags = DRM_UNLOCKED;"
compile_check_conftest "$CODE" "NV_DRM_UNLOCKED_IOCTL_FLAG_PRESENT" "" "types"
;;
# When adding a new conftest entry, please use the correct format for
# specifying the relevant upstream Linux kernel commit.
#
@@ -6680,18 +6729,9 @@ case "$5" in
VFIO_PCI_CORE_PRESENT=1
fi
# When this sanity check is run via nvidia-installer, it sets ARCH as aarch64.
# But, when it is run via Kbuild, ARCH is set as arm64
if [ "$ARCH" = "aarch64" ]; then
ARCH="arm64"
fi
if [ "$VFIO_IOMMU_PRESENT" != "0" ] && [ "$KVM_PRESENT" != "0" ] ; then
# On x86_64, vGPU requires MDEV framework to be present.
# On aarch64, vGPU requires MDEV or vfio-pci-core framework to be present.
if ([ "$ARCH" = "arm64" ] && ([ "$VFIO_MDEV_PRESENT" != "0" ] || [ "$VFIO_PCI_CORE_PRESENT" != "0" ])) ||
([ "$ARCH" = "x86_64" ] && [ "$VFIO_MDEV_PRESENT" != "0" ];) then
# vGPU requires either MDEV or vfio-pci-core framework to be present.
if [ "$VFIO_MDEV_PRESENT" != "0" ] || [ "$VFIO_PCI_CORE_PRESENT" != "0" ]; then
exit 0
fi
fi
@@ -6702,14 +6742,10 @@ case "$5" in
echo "CONFIG_VFIO_IOMMU_TYPE1";
fi
if [ "$ARCH" = "arm64" ] && [ "$VFIO_MDEV_PRESENT" = "0" ] && [ "$VFIO_PCI_CORE_PRESENT" = "0" ]; then
if [ "$VFIO_MDEV_PRESENT" = "0" ] && [ "$VFIO_PCI_CORE_PRESENT" = "0" ]; then
echo "either CONFIG_VFIO_MDEV or CONFIG_VFIO_PCI_CORE";
fi
if [ "$ARCH" = "x86_64" ] && [ "$VFIO_MDEV_PRESENT" = "0" ]; then
echo "CONFIG_VFIO_MDEV";
fi
if [ "$KVM_PRESENT" = "0" ]; then
echo "CONFIG_KVM";
fi

View File

@@ -1312,9 +1312,21 @@ static const struct drm_ioctl_desc nv_drm_ioctls[] = {
DRM_RENDER_ALLOW|DRM_UNLOCKED),
#endif
/*
* DRM_UNLOCKED is implicit for all non-legacy DRM driver IOCTLs since Linux
* v4.10 commit fa5386459f06 "drm: Used DRM_LEGACY for all legacy functions"
* (Linux v4.4 commit ea487835e887 "drm: Enforce unlocked ioctl operation
* for kms driver ioctls" previously did it only for drivers that set the
* DRM_MODESET flag), so this will race with SET_CLIENT_CAP. Linux v4.11
* commit dcf727ab5d17 "drm: setclientcap doesn't need the drm BKL" also
* removed locking from SET_CLIENT_CAP so there is no use attempting to lock
* manually. The latter commit acknowledges that this can expose userspace
* to inconsistent behavior when racing with itself, but accepts that risk.
*/
DRM_IOCTL_DEF_DRV(NVIDIA_GET_CLIENT_CAPABILITY,
nv_drm_get_client_capability_ioctl,
0),
#if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE)
DRM_IOCTL_DEF_DRV(NVIDIA_GET_CRTC_CRC32,
nv_drm_get_crtc_crc32_ioctl,

View File

@@ -243,6 +243,15 @@ static int __nv_drm_nvkms_gem_obj_init(
NvU64 *pages = NULL;
NvU32 numPages = 0;
if ((size % PAGE_SIZE) != 0) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"NvKmsKapiMemory 0x%p size should be in a multiple of page size to "
"create a gem object",
pMemory);
return -EINVAL;
}
nv_nvkms_memory->pPhysicalAddress = NULL;
nv_nvkms_memory->pWriteCombinedIORemapAddress = NULL;
nv_nvkms_memory->physically_mapped = false;

View File

@@ -582,6 +582,19 @@ static inline int nv_drm_format_num_planes(uint32_t format)
#endif /* defined(NV_DRM_FORMAT_MODIFIERS_PRESENT) */
/*
* DRM_UNLOCKED was removed with linux-next commit 2798ffcc1d6a ("drm: Remove
* locking for legacy ioctls and DRM_UNLOCKED"), but it was previously made
* implicit for all non-legacy DRM driver IOCTLs since Linux v4.10 commit
* fa5386459f06 "drm: Used DRM_LEGACY for all legacy functions" (Linux v4.4
* commit ea487835e887 "drm: Enforce unlocked ioctl operation for kms driver
* ioctls" previously did it only for drivers that set the DRM_MODESET flag), so
* it was effectively a no-op anyway.
*/
#if !defined(NV_DRM_UNLOCKED_IOCTL_FLAG_PRESENT)
#define DRM_UNLOCKED 0
#endif
/*
* drm_vma_offset_exact_lookup_locked() were added
* by kernel commit 2225cfe46bcc which was Signed-off-by:

View File

@@ -133,3 +133,4 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += drm_connector_lookup
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_connector_put
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_area_struct_has_const_vm_flags
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_driver_has_dumb_destroy
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_unlocked_ioctl_flag_present

View File

@@ -68,6 +68,9 @@ module_param_named(output_rounding_fix, output_rounding_fix, bool, 0400);
static bool disable_vrr_memclk_switch = false;
module_param_named(disable_vrr_memclk_switch, disable_vrr_memclk_switch, bool, 0400);
static bool opportunistic_display_sync = true;
module_param_named(opportunistic_display_sync, opportunistic_display_sync, bool, 0400);
/* These parameters are used for fault injection tests. Normally the defaults
* should be used. */
MODULE_PARM_DESC(fail_malloc, "Fail the Nth call to nvkms_alloc");
@@ -99,6 +102,11 @@ NvBool nvkms_disable_vrr_memclk_switch(void)
return disable_vrr_memclk_switch;
}
NvBool nvkms_opportunistic_display_sync(void)
{
return opportunistic_display_sync;
}
#define NVKMS_SYNCPT_STUBS_NEEDED
/*************************************************************************
@@ -200,9 +208,23 @@ static inline int nvkms_read_trylock_pm_lock(void)
static inline void nvkms_read_lock_pm_lock(void)
{
while (!down_read_trylock(&nvkms_pm_lock)) {
try_to_freeze();
cond_resched();
if ((current->flags & PF_NOFREEZE)) {
/*
* Non-freezable tasks (i.e. kthreads in this case) don't have to worry
* about being frozen during system suspend, but do need to block so
* that the CPU can go idle during s2idle. Do a normal uninterruptible
* blocking wait for the PM lock.
*/
down_read(&nvkms_pm_lock);
} else {
/*
* For freezable tasks, make sure we give the kernel an opportunity to
* freeze if taking the PM lock fails.
*/
while (!down_read_trylock(&nvkms_pm_lock)) {
try_to_freeze();
cond_resched();
}
}
}

View File

@@ -99,6 +99,7 @@ typedef struct {
NvBool nvkms_output_rounding_fix(void);
NvBool nvkms_disable_vrr_memclk_switch(void);
NvBool nvkms_opportunistic_display_sync(void);
void nvkms_call_rm (void *ops);
void* nvkms_alloc (size_t size,

View File

@@ -1,20 +1,25 @@
/* SPDX-License-Identifier: Linux-OpenIB */
/*
* Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
@@ -43,7 +48,9 @@
MODULE_AUTHOR("Yishai Hadas");
MODULE_DESCRIPTION("NVIDIA GPU memory plug-in");
MODULE_LICENSE("Linux-OpenIB");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION(DRV_VERSION);
enum {
NV_MEM_PEERDIRECT_SUPPORT_DEFAULT = 0,
@@ -53,7 +60,13 @@ static int peerdirect_support = NV_MEM_PEERDIRECT_SUPPORT_DEFAULT;
module_param(peerdirect_support, int, S_IRUGO);
MODULE_PARM_DESC(peerdirect_support, "Set level of support for Peer-direct, 0 [default] or 1 [legacy, for example MLNX_OFED 4.9 LTS]");
#define peer_err(FMT, ARGS...) printk(KERN_ERR "nvidia-peermem" " %s:%d " FMT, __FUNCTION__, __LINE__, ## ARGS)
#define peer_err(FMT, ARGS...) printk(KERN_ERR "nvidia-peermem" " %s:%d ERROR " FMT, __FUNCTION__, __LINE__, ## ARGS)
#ifdef NV_MEM_DEBUG
#define peer_trace(FMT, ARGS...) printk(KERN_DEBUG "nvidia-peermem" " %s:%d TRACE " FMT, __FUNCTION__, __LINE__, ## ARGS)
#else
#define peer_trace(FMT, ARGS...) do {} while (0)
#endif
#if defined(NV_MLNX_IB_PEER_MEM_SYMBOLS_PRESENT)
@@ -74,7 +87,10 @@ invalidate_peer_memory mem_invalidate_callback;
static void *reg_handle = NULL;
static void *reg_handle_nc = NULL;
#define NV_MEM_CONTEXT_MAGIC ((u64)0xF1F4F1D0FEF0DAD0ULL)
struct nv_mem_context {
u64 pad1;
struct nvidia_p2p_page_table *page_table;
struct nvidia_p2p_dma_mapping *dma_mapping;
u64 core_context;
@@ -86,8 +102,22 @@ struct nv_mem_context {
struct task_struct *callback_task;
int sg_allocated;
struct sg_table sg_head;
u64 pad2;
};
#define NV_MEM_CONTEXT_CHECK_OK(MC) ({ \
struct nv_mem_context *mc = (MC); \
int rc = ((0 != mc) && \
(READ_ONCE(mc->pad1) == NV_MEM_CONTEXT_MAGIC) && \
(READ_ONCE(mc->pad2) == NV_MEM_CONTEXT_MAGIC)); \
if (!rc) { \
peer_trace("invalid nv_mem_context=%px pad1=%016llx pad2=%016llx\n", \
mc, \
mc?mc->pad1:0, \
mc?mc->pad2:0); \
} \
rc; \
})
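/*
 * pad1/pad2 are stamped with NV_MEM_CONTEXT_MAGIC in nv_mem_acquire() (see
 * below) and the whole structure is zeroed before kfree(), so a stale or
 * corrupted context handed to nv_get_p2p_free_callback() fails this check
 * instead of being dereferenced.
 */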
static void nv_get_p2p_free_callback(void *data)
{
@@ -97,8 +127,9 @@ static void nv_get_p2p_free_callback(void *data)
struct nvidia_p2p_dma_mapping *dma_mapping = NULL;
__module_get(THIS_MODULE);
if (!nv_mem_context) {
peer_err("nv_get_p2p_free_callback -- invalid nv_mem_context\n");
if (!NV_MEM_CONTEXT_CHECK_OK(nv_mem_context)) {
peer_err("detected invalid context, skipping further processing\n");
goto out;
}
@@ -169,9 +200,11 @@ static int nv_mem_acquire(unsigned long addr, size_t size, void *peer_mem_privat
/* Error case handled as not mine */
return 0;
nv_mem_context->pad1 = NV_MEM_CONTEXT_MAGIC;
nv_mem_context->page_virt_start = addr & GPU_PAGE_MASK;
nv_mem_context->page_virt_end = (addr + size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;
nv_mem_context->mapped_size = nv_mem_context->page_virt_end - nv_mem_context->page_virt_start;
nv_mem_context->pad2 = NV_MEM_CONTEXT_MAGIC;
ret = nvidia_p2p_get_pages(0, 0, nv_mem_context->page_virt_start, nv_mem_context->mapped_size,
&nv_mem_context->page_table, nv_mem_dummy_callback, nv_mem_context);
@@ -195,6 +228,7 @@ static int nv_mem_acquire(unsigned long addr, size_t size, void *peer_mem_privat
return 1;
err:
memset(nv_mem_context, 0, sizeof(*nv_mem_context));
kfree(nv_mem_context);
/* Error case handled as not mine */
@@ -342,6 +376,7 @@ static void nv_mem_release(void *context)
sg_free_table(&nv_mem_context->sg_head);
nv_mem_context->sg_allocated = 0;
}
memset(nv_mem_context, 0, sizeof(*nv_mem_context));
kfree(nv_mem_context);
module_put(THIS_MODULE);
return;

View File

@@ -81,8 +81,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_memory_uc
NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_pages_uc
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ioasid_get
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_set
NV_CONFTEST_FUNCTION_COMPILE_TESTS += migrate_vma_setup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_drop
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmget_not_zero
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmgrab
NV_CONFTEST_FUNCTION_COMPILE_TESTS += iommu_sva_bind_device_has_drvdata_arg
@@ -100,6 +99,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += kmem_cache_has_kobj_remove_work
NV_CONFTEST_TYPE_COMPILE_TESTS += sysfs_slab_unlink
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_fault_t
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_invalidate_range
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_arch_invalidate_secondary_tlbs
NV_CONFTEST_TYPE_COMPILE_TESTS += proc_ops
NV_CONFTEST_TYPE_COMPILE_TESTS += timespec64
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock
@@ -110,6 +110,8 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += handle_mm_fault_has_mm_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += handle_mm_fault_has_pt_regs_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += mempolicy_has_unified_nodes
NV_CONFTEST_TYPE_COMPILE_TESTS += mempolicy_has_home_node
NV_CONFTEST_TYPE_COMPILE_TESTS += mpol_preferred_many_present
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_interval_notifier
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_int_active_memcg
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_migrate_vma_setup

View File

@@ -571,7 +571,6 @@ static void uvm_vm_open_managed_entry(struct vm_area_struct *vma)
static void uvm_vm_close_managed(struct vm_area_struct *vma)
{
uvm_va_space_t *va_space = uvm_va_space_get(vma->vm_file);
uvm_processor_id_t gpu_id;
bool make_zombie = false;
if (current->mm != NULL)
@@ -606,12 +605,6 @@ static void uvm_vm_close_managed(struct vm_area_struct *vma)
uvm_destroy_vma_managed(vma, make_zombie);
// Notify GPU address spaces that the fault buffer needs to be flushed to
// avoid finding stale entries that can be attributed to new VA ranges
// reallocated at the same address.
for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
}
uvm_va_space_up_write(va_space);
if (current->mm != NULL)

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2021 NVIDIA Corporation
Copyright (c) 2021-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -94,4 +94,6 @@ void uvm_hal_ada_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->map_remap_larger_page_promotion = false;
parent_gpu->plc_supported = true;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -101,4 +101,6 @@ void uvm_hal_ampere_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->map_remap_larger_page_promotion = false;
parent_gpu->plc_supported = true;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018 NVIDIA Corporation
Copyright (c) 2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -107,10 +107,10 @@ static NV_STATUS service_ats_faults(uvm_gpu_va_space_t *gpu_va_space,
return status;
}
static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
NvU64 addr,
size_t size,
uvm_fault_client_type_t client_type)
static void flush_tlb_va_region(uvm_gpu_va_space_t *gpu_va_space,
NvU64 addr,
size_t size,
uvm_fault_client_type_t client_type)
{
uvm_ats_fault_invalidate_t *ats_invalidate;
@@ -119,12 +119,12 @@ static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
else
ats_invalidate = &gpu_va_space->gpu->parent->fault_buffer_info.non_replayable.ats_invalidate;
if (!ats_invalidate->write_faults_in_batch) {
uvm_tlb_batch_begin(&gpu_va_space->page_tables, &ats_invalidate->write_faults_tlb_batch);
ats_invalidate->write_faults_in_batch = true;
if (!ats_invalidate->tlb_batch_pending) {
uvm_tlb_batch_begin(&gpu_va_space->page_tables, &ats_invalidate->tlb_batch);
ats_invalidate->tlb_batch_pending = true;
}
uvm_tlb_batch_invalidate(&ats_invalidate->write_faults_tlb_batch, addr, size, PAGE_SIZE, UVM_MEMBAR_NONE);
uvm_tlb_batch_invalidate(&ats_invalidate->tlb_batch, addr, size, PAGE_SIZE, UVM_MEMBAR_NONE);
}
static void ats_batch_select_residency(uvm_gpu_va_space_t *gpu_va_space,
@@ -149,7 +149,11 @@ static void ats_batch_select_residency(uvm_gpu_va_space_t *gpu_va_space,
mode = vma_policy->mode;
if ((mode == MPOL_BIND) || (mode == MPOL_PREFERRED_MANY) || (mode == MPOL_PREFERRED)) {
if ((mode == MPOL_BIND)
#if defined(NV_MPOL_PREFERRED_MANY_PRESENT)
|| (mode == MPOL_PREFERRED_MANY)
#endif
|| (mode == MPOL_PREFERRED)) {
int home_node = NUMA_NO_NODE;
#if defined(NV_MEMPOLICY_HAS_HOME_NODE)
@@ -467,6 +471,10 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
uvm_page_mask_and(write_fault_mask, write_fault_mask, read_fault_mask);
else
uvm_page_mask_zero(write_fault_mask);
// There are no pending faults beyond write faults to RO region.
if (uvm_page_mask_empty(read_fault_mask))
return status;
}
ats_batch_select_residency(gpu_va_space, vma, ats_context);
@@ -489,6 +497,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
if (vma->vm_flags & VM_WRITE) {
uvm_page_mask_region_fill(faults_serviced_mask, subregion);
uvm_ats_smmu_invalidate_tlbs(gpu_va_space, start, length);
// The Linux kernel never invalidates TLB entries on mapping
// permission upgrade. This is a problem if the GPU has cached
@@ -499,7 +508,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
// infinite loop because we just forward the fault to the Linux
// kernel and it will see that the permissions in the page table are
// correct. Therefore, we flush TLB entries on ATS write faults.
flush_tlb_write_faults(gpu_va_space, start, length, client_type);
flush_tlb_va_region(gpu_va_space, start, length, client_type);
}
else {
uvm_page_mask_region_fill(reads_serviced_mask, subregion);
@@ -522,6 +531,15 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
return status;
uvm_page_mask_region_fill(faults_serviced_mask, subregion);
// Similarly to permission upgrade scenario, discussed above, GPU
// will not re-fetch the entry if the PTE is invalid and page size
// is 4K. To avoid infinite faulting loop, invalidate TLB for every
// new translation written explicitly like in the case of permission
// upgrade.
if (PAGE_SIZE == UVM_PAGE_SIZE_4K)
flush_tlb_va_region(gpu_va_space, start, length, client_type);
}
return status;
@@ -556,7 +574,7 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
NV_STATUS status;
uvm_push_t push;
if (!ats_invalidate->write_faults_in_batch)
if (!ats_invalidate->tlb_batch_pending)
return NV_OK;
UVM_ASSERT(gpu_va_space);
@@ -568,7 +586,7 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
"Invalidate ATS entries");
if (status == NV_OK) {
uvm_tlb_batch_end(&ats_invalidate->write_faults_tlb_batch, &push, UVM_MEMBAR_NONE);
uvm_tlb_batch_end(&ats_invalidate->tlb_batch, &push, UVM_MEMBAR_NONE);
uvm_push_end(&push);
// Add this push to the GPU's tracker so that fault replays/clears can
@@ -576,8 +594,7 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
status = uvm_tracker_add_push_safe(out_tracker, &push);
}
ats_invalidate->write_faults_in_batch = false;
ats_invalidate->tlb_batch_pending = false;
return status;
}

View File

@@ -52,7 +52,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
bool uvm_ats_check_in_gmmu_region(uvm_va_space_t *va_space, NvU64 address, uvm_va_range_t *next);
// This function performs pending TLB invalidations for ATS and clears the
// ats_invalidate->write_faults_in_batch flag
// ats_invalidate->tlb_batch_pending flag
NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
uvm_ats_fault_invalidate_t *ats_invalidate,
uvm_tracker_t *out_tracker);

View File

@@ -29,8 +29,13 @@
#include "uvm_va_space.h"
#include "uvm_va_space_mm.h"
#include <asm/io.h>
#include <linux/log2.h>
#include <linux/iommu.h>
#include <linux/mm_types.h>
#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/mmu_context.h>
// linux/sched/mm.h is needed for mmget_not_zero and mmput to get the mm
// reference required for the iommu_sva_bind_device() call. This header is not
@@ -46,17 +51,276 @@
#define UVM_IOMMU_SVA_BIND_DEVICE(dev, mm) iommu_sva_bind_device(dev, mm)
#endif
// Type to represent a 128-bit SMMU command queue command.
struct smmu_cmd {
NvU64 low;
NvU64 high;
};
// Base address of SMMU CMDQ-V for GSMMU0.
#define SMMU_CMDQV_BASE_ADDR(smmu_base) (smmu_base + 0x200000)
#define SMMU_CMDQV_BASE_LEN 0x00830000
// CMDQV configuration is done by firmware but we check status here.
#define SMMU_CMDQV_CONFIG 0x0
#define SMMU_CMDQV_CONFIG_CMDQV_EN BIT(0)
// Used to map a particular VCMDQ to a VINTF.
#define SMMU_CMDQV_CMDQ_ALLOC_MAP(vcmdq_id) (0x200 + 0x4 * (vcmdq_id))
#define SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC BIT(0)
// Shift for the field containing the index of the virtual interface
// owning the VCMDQ.
#define SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT 15
// Base address for the VINTF registers.
#define SMMU_VINTF_BASE_ADDR(cmdqv_base_addr, vintf_id) (cmdqv_base_addr + 0x1000 + 0x100 * (vintf_id))
// Virtual interface (VINTF) configuration registers. The WAR only
// works on baremetal so we need to configure ourselves as the
// hypervisor owner.
#define SMMU_VINTF_CONFIG 0x0
#define SMMU_VINTF_CONFIG_ENABLE BIT(0)
#define SMMU_VINTF_CONFIG_HYP_OWN BIT(17)
#define SMMU_VINTF_STATUS 0x0
#define SMMU_VINTF_STATUS_ENABLED BIT(0)
// Calculates the base address for a particular VCMDQ instance.
#define SMMU_VCMDQ_BASE_ADDR(cmdqv_base_addr, vcmdq_id) (cmdqv_base_addr + 0x10000 + 0x80 * (vcmdq_id))
// SMMU command queue consumer index register. Updated by SMMU
// when commands are consumed.
#define SMMU_VCMDQ_CONS 0x0
// SMMU command queue producer index register. Updated by UVM when
// commands are added to the queue.
#define SMMU_VCMDQ_PROD 0x4
// Configuration register used to enable a VCMDQ.
#define SMMU_VCMDQ_CONFIG 0x8
#define SMMU_VCMDQ_CONFIG_ENABLE BIT(0)
// Status register used to check the VCMDQ is enabled.
#define SMMU_VCMDQ_STATUS 0xc
#define SMMU_VCMDQ_STATUS_ENABLED BIT(0)
// Base address offset for the VCMDQ registers.
#define SMMU_VCMDQ_CMDQ_BASE 0x10000
// Size of the command queue. Each command is 16 bytes and we can't
// have a command queue greater than one page in size.
#define SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE (PAGE_SHIFT - ilog2(sizeof(struct smmu_cmd)))
#define SMMU_VCMDQ_CMDQ_ENTRIES (1UL << SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE)
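// Worked example, assuming 4 KiB pages: sizeof(struct smmu_cmd) is 16 bytes,
// so SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE is 12 - 4 = 8 and the queue holds
// SMMU_VCMDQ_CMDQ_ENTRIES = 256 commands, i.e. exactly one page.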
// We always use VINTF63 for the WAR
#define VINTF 63
static void smmu_vintf_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
{
iowrite32(val, SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg);
}
static NvU32 smmu_vintf_read32(void __iomem *smmu_cmdqv_base, int reg)
{
return ioread32(SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg);
}
// We always use VCMDQ127 for the WAR
#define VCMDQ 127
void smmu_vcmdq_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
{
iowrite32(val, SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
NvU32 smmu_vcmdq_read32(void __iomem *smmu_cmdqv_base, int reg)
{
return ioread32(SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
static void smmu_vcmdq_write64(void __iomem *smmu_cmdqv_base, int reg, NvU64 val)
{
iowrite64(val, SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
// Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
// TLB invalidates on read-only to read-write upgrades
static NV_STATUS uvm_ats_smmu_war_init(uvm_parent_gpu_t *parent_gpu)
{
uvm_spin_loop_t spin;
NV_STATUS status;
unsigned long cmdqv_config;
void __iomem *smmu_cmdqv_base;
struct acpi_iort_node *node;
struct acpi_iort_smmu_v3 *iort_smmu;
node = *(struct acpi_iort_node **) dev_get_platdata(parent_gpu->pci_dev->dev.iommu->iommu_dev->dev->parent);
iort_smmu = (struct acpi_iort_smmu_v3 *) node->node_data;
smmu_cmdqv_base = ioremap(SMMU_CMDQV_BASE_ADDR(iort_smmu->base_address), SMMU_CMDQV_BASE_LEN);
if (!smmu_cmdqv_base)
return NV_ERR_NO_MEMORY;
parent_gpu->smmu_war.smmu_cmdqv_base = smmu_cmdqv_base;
cmdqv_config = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CONFIG);
if (!(cmdqv_config & SMMU_CMDQV_CONFIG_CMDQV_EN)) {
status = NV_ERR_OBJECT_NOT_FOUND;
goto out;
}
// Allocate SMMU CMDQ pages for WAR
parent_gpu->smmu_war.smmu_cmdq = alloc_page(NV_UVM_GFP_FLAGS | __GFP_ZERO);
if (!parent_gpu->smmu_war.smmu_cmdq) {
status = NV_ERR_NO_MEMORY;
goto out;
}
// Initialise VINTF for the WAR
smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, SMMU_VINTF_CONFIG_ENABLE | SMMU_VINTF_CONFIG_HYP_OWN);
UVM_SPIN_WHILE(!(smmu_vintf_read32(smmu_cmdqv_base, SMMU_VINTF_STATUS) & SMMU_VINTF_STATUS_ENABLED), &spin);
// Allocate VCMDQ to VINTF
iowrite32((VINTF << SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT) | SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC,
smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
smmu_vcmdq_write64(smmu_cmdqv_base, SMMU_VCMDQ_CMDQ_BASE,
page_to_phys(parent_gpu->smmu_war.smmu_cmdq) | SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONS, 0);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_PROD, 0);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, SMMU_VCMDQ_CONFIG_ENABLE);
UVM_SPIN_WHILE(!(smmu_vcmdq_read32(smmu_cmdqv_base, SMMU_VCMDQ_STATUS) & SMMU_VCMDQ_STATUS_ENABLED), &spin);
uvm_mutex_init(&parent_gpu->smmu_war.smmu_lock, UVM_LOCK_ORDER_LEAF);
parent_gpu->smmu_war.smmu_prod = 0;
parent_gpu->smmu_war.smmu_cons = 0;
return NV_OK;
out:
iounmap(parent_gpu->smmu_war.smmu_cmdqv_base);
parent_gpu->smmu_war.smmu_cmdqv_base = NULL;
return status;
}
static void uvm_ats_smmu_war_deinit(uvm_parent_gpu_t *parent_gpu)
{
void __iomem *smmu_cmdqv_base = parent_gpu->smmu_war.smmu_cmdqv_base;
NvU32 cmdq_alloc_map;
if (parent_gpu->smmu_war.smmu_cmdqv_base) {
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, 0);
cmdq_alloc_map = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
iowrite32(cmdq_alloc_map & SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC, smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, 0);
}
if (parent_gpu->smmu_war.smmu_cmdq)
__free_page(parent_gpu->smmu_war.smmu_cmdq);
if (parent_gpu->smmu_war.smmu_cmdqv_base)
iounmap(parent_gpu->smmu_war.smmu_cmdqv_base);
}
// The SMMU on ARM64 can run under different translation regimes depending on
// what features the OS and CPU variant support. The CPU for GH180 supports
// virtualisation extensions and starts the kernel at EL2 meaning SMMU operates
// under the NS-EL2-E2H translation regime. Therefore we need to use the
// TLBI_EL2_* commands which invalidate TLB entries created under this
// translation regime.
#define CMDQ_OP_TLBI_EL2_ASID 0x21;
#define CMDQ_OP_TLBI_EL2_VA 0x22;
#define CMDQ_OP_CMD_SYNC 0x46
// Use the same maximum as used for MAX_TLBI_OPS in the upstream
// kernel.
#define UVM_MAX_TLBI_OPS (1UL << (PAGE_SHIFT - 3))
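// With 4 KiB pages this is 1UL << 9 = 512 invalidate-by-VA operations before
// falling back to a full-ASID invalidate (see uvm_ats_smmu_invalidate_tlbs()
// below), mirroring the upstream MAX_TLBI_OPS heuristic referenced above.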
#if UVM_ATS_SMMU_WAR_REQUIRED()
void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
struct mm_struct *mm = gpu_va_space->va_space->va_space_mm.mm;
uvm_parent_gpu_t *parent_gpu = gpu_va_space->gpu->parent;
struct {
NvU64 low;
NvU64 high;
} *vcmdq;
unsigned long vcmdq_prod;
NvU64 end;
uvm_spin_loop_t spin;
NvU16 asid;
if (!parent_gpu->smmu_war.smmu_cmdqv_base)
return;
asid = arm64_mm_context_get(mm);
vcmdq = kmap(parent_gpu->smmu_war.smmu_cmdq);
uvm_mutex_lock(&parent_gpu->smmu_war.smmu_lock);
vcmdq_prod = parent_gpu->smmu_war.smmu_prod;
// Our queue management is very simple. The mutex prevents multiple
// producers writing to the queue and all our commands require waiting for
// the queue to drain so we know it's empty. If we can't fit enough commands
// in the queue we just invalidate the whole ASID.
//
// The command queue is a circular buffer with the MSB representing a wrap
// bit that must toggle on each wrap. See the SMMU architecture
// specification for more details.
//
// SMMU_VCMDQ_CMDQ_ENTRIES - 1 because we need to leave space for the
// CMD_SYNC.
if ((size >> PAGE_SHIFT) > min(UVM_MAX_TLBI_OPS, SMMU_VCMDQ_CMDQ_ENTRIES - 1)) {
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_TLBI_EL2_ASID;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low |= (NvU64) asid << 48;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = 0;
vcmdq_prod++;
}
else {
for (end = addr + size; addr < end; addr += PAGE_SIZE) {
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_TLBI_EL2_VA;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low |= (NvU64) asid << 48;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = addr & ~((1UL << 12) - 1);
vcmdq_prod++;
}
}
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_CMD_SYNC;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = 0x0;
vcmdq_prod++;
// MSB is the wrap bit
vcmdq_prod &= (1UL << (SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE + 1)) - 1;
parent_gpu->smmu_war.smmu_prod = vcmdq_prod;
smmu_vcmdq_write32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_PROD, parent_gpu->smmu_war.smmu_prod);
UVM_SPIN_WHILE(
(smmu_vcmdq_read32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_CONS) & GENMASK(19, 0)) != vcmdq_prod,
&spin);
uvm_mutex_unlock(&parent_gpu->smmu_war.smmu_lock);
kunmap(parent_gpu->smmu_war.smmu_cmdq);
arm64_mm_context_put(mm);
}
#endif
NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
{
int ret;
ret = iommu_dev_enable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
if (ret)
return errno_to_nv_status(ret);
return errno_to_nv_status(ret);
if (UVM_ATS_SMMU_WAR_REQUIRED())
return uvm_ats_smmu_war_init(parent_gpu);
else
return NV_OK;
}
void uvm_ats_sva_remove_gpu(uvm_parent_gpu_t *parent_gpu)
{
if (UVM_ATS_SMMU_WAR_REQUIRED())
uvm_ats_smmu_war_deinit(parent_gpu);
iommu_dev_disable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
}

View File

@@ -32,23 +32,38 @@
// For ATS support on aarch64, arm_smmu_sva_bind() is needed for
// iommu_sva_bind_device() calls. Unfortunately, arm_smmu_sva_bind() is not
// conftest-able. We instead look for the presence of ioasid_get() or
// mm_pasid_set(). ioasid_get() was added in the same patch series as
// arm_smmu_sva_bind() and removed in v6.0. mm_pasid_set() was added in the
// mm_pasid_drop(). ioasid_get() was added in the same patch series as
// arm_smmu_sva_bind() and removed in v6.0. mm_pasid_drop() was added in the
// same patch as the removal of ioasid_get(). We assume the presence of
// arm_smmu_sva_bind() if ioasid_get(v5.11 - v5.17) or mm_pasid_set(v5.18+) is
// arm_smmu_sva_bind() if ioasid_get(v5.11 - v5.17) or mm_pasid_drop(v5.18+) is
// present.
//
// arm_smmu_sva_bind() was added with commit
// 32784a9562fb0518b12e9797ee2aec52214adf6f and ioasid_get() was added with
// commit cb4789b0d19ff231ce9f73376a023341300aed96 (11/23/2020). Commit
// 701fac40384f07197b106136012804c3cae0b3de (02/15/2022) removed ioasid_get()
// and added mm_pasid_set().
#if UVM_CAN_USE_MMU_NOTIFIERS() && (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_SET_PRESENT))
#define UVM_ATS_SVA_SUPPORTED() 1
// and added mm_pasid_drop().
#if UVM_CAN_USE_MMU_NOTIFIERS() && (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_DROP_PRESENT))
#if defined(CONFIG_IOMMU_SVA)
#define UVM_ATS_SVA_SUPPORTED() 1
#else
#define UVM_ATS_SVA_SUPPORTED() 0
#endif
#else
#define UVM_ATS_SVA_SUPPORTED() 0
#endif
// If NV_ARCH_INVALIDATE_SECONDARY_TLBS is defined it means the upstream fix is
// in place so no need for the WAR from Bug 4130089: [GH180][r535] WAR for
// kernel not issuing SMMU TLB invalidates on read-only
#if defined(NV_ARCH_INVALIDATE_SECONDARY_TLBS)
#define UVM_ATS_SMMU_WAR_REQUIRED() 0
#elif NVCPU_IS_AARCH64
#define UVM_ATS_SMMU_WAR_REQUIRED() 1
#else
#define UVM_ATS_SMMU_WAR_REQUIRED() 0
#endif
typedef struct
{
int placeholder;
@@ -77,6 +92,17 @@ typedef struct
// LOCKING: None
void uvm_ats_sva_unregister_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space);
// Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
// TLB invalidates on read-only to read-write upgrades
#if UVM_ATS_SMMU_WAR_REQUIRED()
void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size);
#else
static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
}
#endif
#else
static NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
{
@@ -107,6 +133,11 @@ typedef struct
{
}
static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
}
#endif // UVM_ATS_SVA_SUPPORTED
#endif // __UVM_ATS_SVA_H__

View File

@@ -191,7 +191,7 @@ static NV_STATUS test_membar(uvm_gpu_t *gpu)
for (i = 0; i < REDUCTIONS; ++i) {
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
gpu->parent->ce_hal->semaphore_reduction_inc(&push, host_mem_gpu_va, REDUCTIONS + 1);
gpu->parent->ce_hal->semaphore_reduction_inc(&push, host_mem_gpu_va, REDUCTIONS);
}
// Without a sys membar the channel tracking semaphore can and does complete
@@ -577,7 +577,7 @@ static NV_STATUS test_semaphore_reduction_inc(uvm_gpu_t *gpu)
for (i = 0; i < REDUCTIONS; i++) {
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va, i+1);
gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va, REDUCTIONS);
}
status = uvm_push_end_and_wait(&push);

View File

@@ -21,8 +21,8 @@
*******************************************************************************/
#ifndef _UVM_COMMON_H
#define _UVM_COMMON_H
#ifndef __UVM_COMMON_H__
#define __UVM_COMMON_H__
#ifdef DEBUG
#define UVM_IS_DEBUG() 1
@@ -413,4 +413,40 @@ static inline void uvm_touch_page(struct page *page)
// Return true if the VMA is one used by UVM managed allocations.
bool uvm_vma_is_managed(struct vm_area_struct *vma);
#endif /* _UVM_COMMON_H */
static bool uvm_platform_uses_canonical_form_address(void)
{
if (NVCPU_IS_PPC64LE)
return false;
return true;
}
// Similar to the GPU MMU HAL num_va_bits(), it returns the CPU's num_va_bits().
static NvU32 uvm_cpu_num_va_bits(void)
{
return fls64(TASK_SIZE - 1) + 1;
}
// Return the unaddressable range in a num_va_bits-wide VA space, [first, outer)
static void uvm_get_unaddressable_range(NvU32 num_va_bits, NvU64 *first, NvU64 *outer)
{
UVM_ASSERT(num_va_bits < 64);
UVM_ASSERT(first);
UVM_ASSERT(outer);
if (uvm_platform_uses_canonical_form_address()) {
*first = 1ULL << (num_va_bits - 1);
*outer = (NvU64)((NvS64)(1ULL << 63) >> (64 - num_va_bits));
}
else {
*first = 1ULL << num_va_bits;
*outer = ~0Ull;
}
}
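// Worked example: on a canonical-form platform with num_va_bits == 48 (e.g.
// x86-64 with 4-level paging, where TASK_SIZE is just under 1ULL << 47 and
// uvm_cpu_num_va_bits() returns 48), the hole computed above is
// [0x0000800000000000, 0xffff800000000000): bit 63 is sign-extended down to
// bit 47, splitting the VA space into canonical lower and upper halves.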
static void uvm_cpu_get_unaddressable_range(NvU64 *first, NvU64 *outer)
{
return uvm_get_unaddressable_range(uvm_cpu_num_va_bits(), first, outer);
}
#endif /* __UVM_COMMON_H__ */

View File

@@ -218,19 +218,12 @@ static bool gpu_supports_uvm(uvm_parent_gpu_t *parent_gpu)
return parent_gpu->rm_info.subdeviceCount == 1;
}
static bool platform_uses_canonical_form_address(void)
{
if (NVCPU_IS_PPC64LE)
return false;
return true;
}
bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
{
// Lower and upper address spaces are typically found in platforms that use
// the canonical address form.
NvU64 max_va_lower;
NvU64 min_va_upper;
NvU64 addr_end = addr + size - 1;
NvU8 gpu_addr_shift;
NvU8 cpu_addr_shift;
@@ -243,7 +236,7 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
UVM_ASSERT(size > 0);
gpu_addr_shift = gpu->address_space_tree.hal->num_va_bits();
cpu_addr_shift = fls64(TASK_SIZE - 1) + 1;
cpu_addr_shift = uvm_cpu_num_va_bits();
addr_shift = gpu_addr_shift;
// Pascal+ GPUs are capable of accessing kernel pointers in various modes
@@ -279,9 +272,7 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
// 0 +----------------+ 0 +----------------+
// On canonical form address platforms and Pascal+ GPUs.
if (platform_uses_canonical_form_address() && gpu_addr_shift > 40) {
NvU64 min_va_upper;
if (uvm_platform_uses_canonical_form_address() && gpu_addr_shift > 40) {
// On x86, when cpu_addr_shift > gpu_addr_shift, it means the CPU uses
// 5-level paging and the GPU is pre-Hopper. On Pascal-Ada GPUs (49b
// wide VA) we set addr_shift to match a 4-level paging x86 (48b wide).
@@ -292,15 +283,11 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
addr_shift = gpu_addr_shift;
else
addr_shift = cpu_addr_shift;
}
min_va_upper = (NvU64)((NvS64)(1ULL << 63) >> (64 - addr_shift));
max_va_lower = 1ULL << (addr_shift - 1);
return (addr_end < max_va_lower) || (addr >= min_va_upper);
}
else {
max_va_lower = 1ULL << addr_shift;
return addr_end < max_va_lower;
}
uvm_get_unaddressable_range(addr_shift, &max_va_lower, &min_va_upper);
return (addr_end < max_va_lower) || (addr >= min_va_upper);
}
// The internal UVM VAS does not use canonical form addresses.
@@ -326,14 +313,14 @@ NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr)
NvU8 addr_shift;
NvU64 input_addr = addr;
if (platform_uses_canonical_form_address()) {
if (uvm_platform_uses_canonical_form_address()) {
// When the CPU VA width is larger than GPU's, it means that:
// On ARM: the CPU is on LVA mode and the GPU is pre-Hopper.
// On x86: the CPU uses 5-level paging and the GPU is pre-Hopper.
// We sign-extend on the 48b on ARM and on the 47b on x86 to mirror the
// behavior of CPUs with smaller (than GPU) VA widths.
gpu_addr_shift = parent_gpu->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K)->num_va_bits();
cpu_addr_shift = fls64(TASK_SIZE - 1) + 1;
cpu_addr_shift = uvm_cpu_num_va_bits();
if (cpu_addr_shift > gpu_addr_shift)
addr_shift = NVCPU_IS_X86_64 ? 48 : 49;

View File

@@ -57,14 +57,16 @@
typedef struct
{
// Number of faults from this uTLB that have been fetched but have not been serviced yet
// Number of faults from this uTLB that have been fetched but have not been
// serviced yet.
NvU32 num_pending_faults;
// Whether the uTLB contains fatal faults
bool has_fatal_faults;
// We have issued a replay of type START_ACK_ALL while containing fatal faults. This puts
// the uTLB in lockdown mode and no new translations are accepted
// We have issued a replay of type START_ACK_ALL while containing fatal
// faults. This puts the uTLB in lockdown mode and no new translations are
// accepted.
bool in_lockdown;
// We have issued a cancel on this uTLB
@@ -126,8 +128,8 @@ struct uvm_service_block_context_struct
struct list_head service_context_list;
// A mask of GPUs that need to be checked for ECC errors before the CPU
// fault handler returns, but after the VA space lock has been unlocked to
// avoid the RM/UVM VA space lock deadlocks.
// fault handler returns, but after the VA space lock has been unlocked
// to avoid the RM/UVM VA space lock deadlocks.
uvm_processor_mask_t gpus_to_check_for_ecc;
// This is set to throttle page fault thrashing.
@@ -160,9 +162,9 @@ struct uvm_service_block_context_struct
struct
{
// Per-processor mask with the pages that will be resident after servicing.
// We need one mask per processor because we may coalesce faults that
// trigger migrations to different processors.
// Per-processor mask with the pages that will be resident after
// servicing. We need one mask per processor because we may coalesce
// faults that trigger migrations to different processors.
uvm_page_mask_t new_residency;
} per_processor_masks[UVM_ID_MAX_PROCESSORS];
@@ -263,7 +265,10 @@ struct uvm_fault_service_batch_context_struct
NvU32 num_coalesced_faults;
bool has_fatal_faults;
// One of the VA spaces in this batch which had fatal faults. If NULL, no
// faults were fatal. More than one VA space could have fatal faults, but we
// pick one to be the target of the cancel sequence.
uvm_va_space_t *fatal_va_space;
bool has_throttled_faults;
@@ -291,11 +296,8 @@ struct uvm_fault_service_batch_context_struct
struct uvm_ats_fault_invalidate_struct
{
// Whether the TLB batch contains any information
bool write_faults_in_batch;
// Batch of TLB entries to be invalidated
uvm_tlb_batch_t write_faults_tlb_batch;
bool tlb_batch_pending;
uvm_tlb_batch_t tlb_batch;
};
typedef struct
@@ -440,20 +442,9 @@ struct uvm_access_counter_service_batch_context_struct
NvU32 num_notifications;
// Boolean used to avoid sorting the fault batch by instance_ptr if we
// determine at fetch time that all the access counter notifications in the
// batch report the same instance_ptr
// determine at fetch time that all the access counter notifications in
// the batch report the same instance_ptr
bool is_single_instance_ptr;
// Scratch space, used to generate artificial physically addressed notifications.
// Virtual address notifications are always aligned to 64k. This means up to 16
// different physical locations could have been accessed to trigger one notification.
// The sub-granularity mask can correspond to any of them.
struct
{
uvm_processor_id_t resident_processors[16];
uvm_gpu_phys_address_t phys_addresses[16];
uvm_access_counter_buffer_entry_t phys_entry;
} scratch;
} virt;
struct
@@ -464,8 +455,8 @@ struct uvm_access_counter_service_batch_context_struct
NvU32 num_notifications;
// Boolean used to avoid sorting the fault batch by aperture if we
// determine at fetch time that all the access counter notifications in the
// batch report the same aperture
// determine at fetch time that all the access counter notifications in
// the batch report the same aperture
bool is_single_aperture;
} phys;
@@ -661,8 +652,8 @@ struct uvm_gpu_struct
struct
{
// Big page size used by the internal UVM VA space
// Notably it may be different than the big page size used by a user's VA
// space in general.
// Notably it may be different than the big page size used by a user's
// VA space in general.
NvU32 internal_size;
} big_page;
@@ -688,8 +679,8 @@ struct uvm_gpu_struct
// lazily-populated array of peer GPUs, indexed by the peer's GPU index
uvm_gpu_t *peer_gpus[UVM_ID_MAX_GPUS];
// Leaf spinlock used to synchronize access to the peer_gpus table so that
// it can be safely accessed from the access counters bottom half
// Leaf spinlock used to synchronize access to the peer_gpus table so
// that it can be safely accessed from the access counters bottom half
uvm_spinlock_t peer_gpus_lock;
} peer_info;
@@ -980,6 +971,10 @@ struct uvm_parent_gpu_struct
bool plc_supported;
// If true, page_tree initialization pre-populates no_ats_ranges. It only
// affects ATS systems.
bool no_ats_range_required;
// Parameters used by the TLB batching API
struct
{
@@ -1051,14 +1046,16 @@ struct uvm_parent_gpu_struct
// Interrupt handling state and locks
uvm_isr_info_t isr;
// Fault buffer info. This is only valid if supports_replayable_faults is set to true
// Fault buffer info. This is only valid if supports_replayable_faults is
// set to true.
uvm_fault_buffer_info_t fault_buffer_info;
// PMM lazy free processing queue.
// TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue.
nv_kthread_q_t lazy_free_q;
// Access counter buffer info. This is only valid if supports_access_counters is set to true
// Access counter buffer info. This is only valid if
// supports_access_counters is set to true.
uvm_access_counter_buffer_info_t access_counter_buffer_info;
// Number of uTLBs per GPC. This information is only valid on Pascal+ GPUs.
@@ -1108,7 +1105,7 @@ struct uvm_parent_gpu_struct
uvm_rb_tree_t instance_ptr_table;
uvm_spinlock_t instance_ptr_table_lock;
// This is set to true if the GPU belongs to an SLI group. Else, set to false.
// This is set to true if the GPU belongs to an SLI group.
bool sli_enabled;
struct
@@ -1135,8 +1132,8 @@ struct uvm_parent_gpu_struct
// environment, rather than using the peer-id field of the PTE (which can
// only address 8 gpus), all gpus are assigned a 47-bit physical address
// space by the fabric manager. Any physical address access to these
// physical address spaces are routed through the switch to the corresponding
// peer.
// physical address spaces are routed through the switch to the
// corresponding peer.
struct
{
bool is_nvswitch_connected;
@@ -1162,6 +1159,16 @@ struct uvm_parent_gpu_struct
NvU64 memory_window_start;
NvU64 memory_window_end;
} system_bus;
// WAR to issue ATS TLB invalidation commands ourselves.
struct
{
uvm_mutex_t smmu_lock;
struct page *smmu_cmdq;
void __iomem *smmu_cmdqv_base;
unsigned long smmu_prod;
unsigned long smmu_cons;
} smmu_war;
};
static const char *uvm_gpu_name(uvm_gpu_t *gpu)
@@ -1351,7 +1358,8 @@ void uvm_gpu_release_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
// They must not be the same gpu.
uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu);
// Get the processor id accessible by the given GPU for the given physical address
// Get the processor id accessible by the given GPU for the given physical
// address.
uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr);
// Get the P2P capabilities between the gpus with the given indexes
@@ -1448,9 +1456,9 @@ NV_STATUS uvm_gpu_check_ecc_error(uvm_gpu_t *gpu);
// Check for ECC errors without calling into RM
//
// Calling into RM is problematic in many places, this check is always safe to do.
// Returns NV_WARN_MORE_PROCESSING_REQUIRED if there might be an ECC error and
// it's required to call uvm_gpu_check_ecc_error() to be sure.
// Calling into RM is problematic in many places, this check is always safe to
// do. Returns NV_WARN_MORE_PROCESSING_REQUIRED if there might be an ECC error
// and it's required to call uvm_gpu_check_ecc_error() to be sure.
NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu);
// Map size bytes of contiguous sysmem on the GPU for physical access
@@ -1507,6 +1515,8 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
// The GPU must be initialized before calling this function.
bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
bool uvm_platform_uses_canonical_form_address(void);
// Returns addr's canonical form for host systems that use canonical form
// addresses.
NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr);
@@ -1553,8 +1563,9 @@ uvm_aperture_t uvm_gpu_page_tree_init_location(const uvm_gpu_t *gpu);
// Debug print of GPU properties
void uvm_gpu_print(uvm_gpu_t *gpu);
// Add the given instance pointer -> user_channel mapping to this GPU. The bottom
// half GPU page fault handler uses this to look up the VA space for GPU faults.
// Add the given instance pointer -> user_channel mapping to this GPU. The
// bottom half GPU page fault handler uses this to look up the VA space for GPU
// faults.
NV_STATUS uvm_gpu_add_user_channel(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel);
void uvm_gpu_remove_user_channel(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel);

View File

@@ -33,17 +33,17 @@
#include "uvm_va_space_mm.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_perf_module.h"
#include "uvm_ats_ibm.h"
#define UVM_PERF_ACCESS_COUNTER_BATCH_COUNT_MIN 1
#define UVM_PERF_ACCESS_COUNTER_BATCH_COUNT_DEFAULT 256
#define UVM_PERF_ACCESS_COUNTER_GRANULARITY_DEFAULT "2m"
#define UVM_PERF_ACCESS_COUNTER_GRANULARITY UVM_ACCESS_COUNTER_GRANULARITY_2M
#define UVM_PERF_ACCESS_COUNTER_THRESHOLD_MIN 1
#define UVM_PERF_ACCESS_COUNTER_THRESHOLD_MAX ((1 << 16) - 1)
#define UVM_PERF_ACCESS_COUNTER_THRESHOLD_DEFAULT 256
#define UVM_ACCESS_COUNTER_ACTION_NOTIFY 0x1
#define UVM_ACCESS_COUNTER_ACTION_CLEAR 0x2
#define UVM_ACCESS_COUNTER_ON_MANAGED 0x4
#define UVM_ACCESS_COUNTER_ACTION_CLEAR 0x1
#define UVM_ACCESS_COUNTER_PHYS_ON_MANAGED 0x2
// Each page in a tracked physical range may belong to a different VA Block. We
// preallocate an array of reverse map translations. However, access counter
@@ -54,12 +54,6 @@
#define UVM_MAX_TRANSLATION_SIZE (2 * 1024 * 1024ULL)
#define UVM_SUB_GRANULARITY_REGIONS 32
// The GPU offers the following tracking granularities: 64K, 2M, 16M, 16G
//
// Use the largest granularity to minimize the number of access counter
// notifications. This is fine because we simply drop the notifications during
// normal operation, and tests override these values.
static UVM_ACCESS_COUNTER_GRANULARITY g_uvm_access_counter_granularity;
static unsigned g_uvm_access_counter_threshold;
// Per-VA space access counters information
@@ -87,7 +81,6 @@ static int uvm_perf_access_counter_momc_migration_enable = -1;
static unsigned uvm_perf_access_counter_batch_count = UVM_PERF_ACCESS_COUNTER_BATCH_COUNT_DEFAULT;
// See module param documentation below
static char *uvm_perf_access_counter_granularity = UVM_PERF_ACCESS_COUNTER_GRANULARITY_DEFAULT;
static unsigned uvm_perf_access_counter_threshold = UVM_PERF_ACCESS_COUNTER_THRESHOLD_DEFAULT;
// Module parameters for the tunables
@@ -100,10 +93,6 @@ MODULE_PARM_DESC(uvm_perf_access_counter_momc_migration_enable,
"Whether MOMC access counters will trigger migrations."
"Valid values: <= -1 (default policy), 0 (off), >= 1 (on)");
module_param(uvm_perf_access_counter_batch_count, uint, S_IRUGO);
module_param(uvm_perf_access_counter_granularity, charp, S_IRUGO);
MODULE_PARM_DESC(uvm_perf_access_counter_granularity,
"Size of the physical memory region tracked by each counter. Valid values as"
"of Volta: 64k, 2m, 16m, 16g");
module_param(uvm_perf_access_counter_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(uvm_perf_access_counter_threshold,
"Number of remote accesses on a region required to trigger a notification."
@@ -136,7 +125,7 @@ static va_space_access_counters_info_t *va_space_access_counters_info_get(uvm_va
// Whether access counter migrations are enabled or not. The policy is as
// follows:
// - MIMC migrations are enabled by default on P9 systems with ATS support
// - MIMC migrations are disabled by default on all systems except P9.
// - MOMC migrations are disabled by default on all systems
// - Users can override this policy by specifying on/off
static bool is_migration_enabled(uvm_access_counter_type_t type)
@@ -159,7 +148,10 @@ static bool is_migration_enabled(uvm_access_counter_type_t type)
if (type == UVM_ACCESS_COUNTER_TYPE_MOMC)
return false;
return g_uvm_global.ats.supported;
if (UVM_ATS_IBM_SUPPORTED())
return g_uvm_global.ats.supported;
return false;
}
// Create the access counters tracking struct for the given VA space
@@ -225,30 +217,18 @@ static NV_STATUS config_granularity_to_bytes(UVM_ACCESS_COUNTER_GRANULARITY gran
return NV_OK;
}
// Clear the given access counter and add it to the per-GPU clear tracker
static NV_STATUS access_counter_clear_targeted(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *entry)
// Clear the access counter notifications and add it to the per-GPU clear
// tracker.
static NV_STATUS access_counter_clear_notifications(uvm_gpu_t *gpu,
uvm_access_counter_buffer_entry_t **notification_start,
NvU32 num_notifications)
{
NvU32 i;
NV_STATUS status;
uvm_push_t push;
uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
if (entry->address.is_virtual) {
status = uvm_push_begin(gpu->channel_manager,
UVM_CHANNEL_TYPE_MEMOPS,
&push,
"Clear access counter with virtual address: 0x%llx",
entry->address.address);
}
else {
status = uvm_push_begin(gpu->channel_manager,
UVM_CHANNEL_TYPE_MEMOPS,
&push,
"Clear access counter with physical address: 0x%llx:%s",
entry->address.address,
uvm_aperture_string(entry->address.aperture));
}
status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, &push, "Clear access counter batch");
if (status != NV_OK) {
UVM_ERR_PRINT("Error creating push to clear access counters: %s, GPU %s\n",
nvstatusToString(status),
@@ -256,7 +236,8 @@ static NV_STATUS access_counter_clear_targeted(uvm_gpu_t *gpu,
return status;
}
gpu->parent->host_hal->access_counter_clear_targeted(&push, entry);
for (i = 0; i < num_notifications; i++)
gpu->parent->host_hal->access_counter_clear_targeted(&push, notification_start[i]);
uvm_push_end(&push);
@@ -381,25 +362,6 @@ NV_STATUS uvm_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu)
g_uvm_access_counter_threshold = uvm_perf_access_counter_threshold;
}
if (strcmp(uvm_perf_access_counter_granularity, "64k") == 0) {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_64K;
}
else if (strcmp(uvm_perf_access_counter_granularity, "2m") == 0) {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_2M;
}
else if (strcmp(uvm_perf_access_counter_granularity, "16m") == 0) {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_16M;
}
else if (strcmp(uvm_perf_access_counter_granularity, "16g") == 0) {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_16G;
}
else {
g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_2M;
pr_info("Invalid value '%s' for uvm_perf_access_counter_granularity, using '%s' instead",
uvm_perf_access_counter_granularity,
UVM_PERF_ACCESS_COUNTER_GRANULARITY_DEFAULT);
}
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
UVM_ASSERT(parent_gpu->access_counter_buffer_hal != NULL);
@@ -422,7 +384,7 @@ NV_STATUS uvm_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu)
UVM_ASSERT(access_counters->rm_info.bufferSize %
parent_gpu->access_counter_buffer_hal->entry_size(parent_gpu) == 0);
status = config_granularity_to_bytes(g_uvm_access_counter_granularity, &granularity_bytes);
status = config_granularity_to_bytes(UVM_PERF_ACCESS_COUNTER_GRANULARITY, &granularity_bytes);
UVM_ASSERT(status == NV_OK);
if (granularity_bytes > UVM_MAX_TRANSLATION_SIZE)
UVM_ASSERT(granularity_bytes % UVM_MAX_TRANSLATION_SIZE == 0);
@@ -641,8 +603,8 @@ NV_STATUS uvm_gpu_access_counters_enable(uvm_gpu_t *gpu, uvm_va_space_t *va_spac
else {
UvmGpuAccessCntrConfig default_config =
{
.mimcGranularity = g_uvm_access_counter_granularity,
.momcGranularity = g_uvm_access_counter_granularity,
.mimcGranularity = UVM_PERF_ACCESS_COUNTER_GRANULARITY,
.momcGranularity = UVM_PERF_ACCESS_COUNTER_GRANULARITY,
.mimcUseLimit = UVM_ACCESS_COUNTER_USE_LIMIT_FULL,
.momcUseLimit = UVM_ACCESS_COUNTER_USE_LIMIT_FULL,
.threshold = g_uvm_access_counter_threshold,
@@ -767,6 +729,22 @@ static int cmp_sort_virt_notifications_by_instance_ptr(const void *_a, const voi
return cmp_access_counter_instance_ptr(a, b);
}
// Sort comparator for pointers to GVA access counter notification buffer
// entries that sorts by va_space, and fault address.
static int cmp_sort_virt_notifications_by_va_space_address(const void *_a, const void *_b)
{
const uvm_access_counter_buffer_entry_t **a = (const uvm_access_counter_buffer_entry_t **)_a;
const uvm_access_counter_buffer_entry_t **b = (const uvm_access_counter_buffer_entry_t **)_b;
int result;
result = UVM_CMP_DEFAULT((*a)->virtual_info.va_space, (*b)->virtual_info.va_space);
if (result != 0)
return result;
return UVM_CMP_DEFAULT((*a)->address.address, (*b)->address.address);
}
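
/*
 * Standalone illustration of the two-key ordering used above: sort by owning
 * VA space first, then by notification address. This sketch sorts plain
 * structs with qsort() rather than pointers to buffer entries; notif_t and its
 * fields are made-up names.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
    uintptr_t va_space;   /* stands in for the va_space pointer */
    uint64_t  address;
} notif_t;

static int cmp_u64(uint64_t a, uint64_t b)
{
    return (a > b) - (a < b);   /* same idea as UVM_CMP_DEFAULT */
}

static int cmp_notif(const void *pa, const void *pb)
{
    const notif_t *a = pa, *b = pb;
    int r = cmp_u64(a->va_space, b->va_space);

    return r ? r : cmp_u64(a->address, b->address);
}

int main(void)
{
    notif_t n[] = { {2, 0x10000}, {1, 0x30000}, {1, 0x20000}, {2, 0x00000} };

    qsort(n, 4, sizeof(n[0]), cmp_notif);
    for (int i = 0; i < 4; i++)
        printf("space=%zu addr=0x%llx\n",
               (size_t)n[i].va_space,
               (unsigned long long)n[i].address);
    return 0;
}
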
// Sort comparator for pointers to GPA access counter notification buffer
// entries that sorts by physical address' aperture
static int cmp_sort_phys_notifications_by_processor_id(const void *_a, const void *_b)
@@ -924,12 +902,11 @@ static void translate_virt_notifications_instance_ptrs(uvm_gpu_t *gpu,
// GVA notifications provide an instance_ptr and ve_id that can be directly
// translated to a VA space. In order to minimize translations, we sort the
// entries by instance_ptr.
// entries by instance_ptr, va_space and notification address in that order.
static void preprocess_virt_notifications(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context)
{
if (!batch_context->virt.is_single_instance_ptr) {
// Sort by instance_ptr
sort(batch_context->virt.notifications,
batch_context->virt.num_notifications,
sizeof(*batch_context->virt.notifications),
@@ -938,6 +915,12 @@ static void preprocess_virt_notifications(uvm_gpu_t *gpu,
}
translate_virt_notifications_instance_ptrs(gpu, batch_context);
sort(batch_context->virt.notifications,
batch_context->virt.num_notifications,
sizeof(*batch_context->virt.notifications),
cmp_sort_virt_notifications_by_va_space_address,
NULL);
}
// GPA notifications provide a physical address and an aperture. Sort
@@ -946,7 +929,6 @@ static void preprocess_virt_notifications(uvm_gpu_t *gpu,
static void preprocess_phys_notifications(uvm_access_counter_service_batch_context_t *batch_context)
{
if (!batch_context->phys.is_single_aperture) {
// Sort by instance_ptr
sort(batch_context->phys.notifications,
batch_context->phys.num_notifications,
sizeof(*batch_context->phys.notifications),
@@ -955,6 +937,28 @@ static void preprocess_phys_notifications(uvm_access_counter_service_batch_conte
}
}
static NV_STATUS notify_tools_and_process_flags(uvm_gpu_t *gpu,
uvm_access_counter_buffer_entry_t **notification_start,
NvU32 num_entries,
NvU32 flags)
{
NV_STATUS status = NV_OK;
if (uvm_enable_builtin_tests) {
// TODO: Bug 4310744: [UVM][TOOLS] Attribute access counter tools events
// to va_space instead of broadcasting.
NvU32 i;
for (i = 0; i < num_entries; i++)
uvm_tools_broadcast_access_counter(gpu, notification_start[i], flags & UVM_ACCESS_COUNTER_PHYS_ON_MANAGED);
}
if (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR)
status = access_counter_clear_notifications(gpu, notification_start, num_entries);
return status;
}
static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
@@ -1163,7 +1167,7 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *current_entry,
const uvm_reverse_map_t *reverse_mappings,
size_t num_reverse_mappings,
unsigned *out_flags)
NvU32 *out_flags)
{
size_t index;
uvm_va_block_t *va_block = reverse_mappings[0].va_block;
@@ -1190,7 +1194,6 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
// If an mm is registered with the VA space, we have to retain it
// in order to lock it before locking the VA space.
mm = uvm_va_space_mm_retain_lock(va_space);
uvm_va_space_down_read(va_space);
// Re-check that the VA block is valid after taking the VA block lock.
@@ -1251,7 +1254,7 @@ static NV_STATUS service_phys_va_blocks(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *current_entry,
const uvm_reverse_map_t *reverse_mappings,
size_t num_reverse_mappings,
unsigned *out_flags)
NvU32 *out_flags)
{
NV_STATUS status = NV_OK;
size_t index;
@@ -1259,7 +1262,7 @@ static NV_STATUS service_phys_va_blocks(uvm_gpu_t *gpu,
*out_flags &= ~UVM_ACCESS_COUNTER_ACTION_CLEAR;
for (index = 0; index < num_reverse_mappings; ++index) {
unsigned out_flags_local = 0;
NvU32 out_flags_local = 0;
status = service_phys_single_va_block(gpu,
batch_context,
current_entry,
@@ -1318,7 +1321,7 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
NvU64 address,
unsigned long sub_granularity,
size_t *num_reverse_mappings,
unsigned *out_flags)
NvU32 *out_flags)
{
NV_STATUS status;
NvU32 region_start, region_end;
@@ -1327,7 +1330,10 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
// Get the reverse_map translations for all the regions set in the
// sub_granularity field of the counter.
for_each_sub_granularity_region(region_start, region_end, sub_granularity, config->sub_granularity_regions_per_translation) {
for_each_sub_granularity_region(region_start,
region_end,
sub_granularity,
config->sub_granularity_regions_per_translation) {
NvU64 local_address = address + region_start * config->sub_granularity_region_size;
NvU32 local_translation_size = (region_end - region_start) * config->sub_granularity_region_size;
uvm_reverse_map_t *local_reverse_mappings = batch_context->phys.translations + *num_reverse_mappings;
@@ -1376,7 +1382,7 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context,
const uvm_access_counter_buffer_entry_t *current_entry,
unsigned *out_flags)
NvU32 *out_flags)
{
NvU64 address;
NvU64 translation_index;
@@ -1387,7 +1393,7 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
size_t total_reverse_mappings = 0;
uvm_gpu_t *resident_gpu = NULL;
NV_STATUS status = NV_OK;
unsigned flags = 0;
NvU32 flags = 0;
address = current_entry->address.address;
UVM_ASSERT(address % config->translation_size == 0);
@@ -1415,7 +1421,7 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
for (translation_index = 0; translation_index < config->translations_per_counter; ++translation_index) {
size_t num_reverse_mappings;
unsigned out_flags_local = 0;
NvU32 out_flags_local = 0;
status = service_phys_notification_translation(gpu,
resident_gpu,
batch_context,
@@ -1437,11 +1443,8 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
sub_granularity = sub_granularity >> config->sub_granularity_regions_per_translation;
}
// Currently we only report events for our tests, not for tools
if (uvm_enable_builtin_tests) {
*out_flags |= UVM_ACCESS_COUNTER_ACTION_NOTIFY;
*out_flags |= ((total_reverse_mappings != 0) ? UVM_ACCESS_COUNTER_ON_MANAGED : 0);
}
if (uvm_enable_builtin_tests)
*out_flags |= ((total_reverse_mappings != 0) ? UVM_ACCESS_COUNTER_PHYS_ON_MANAGED : 0);
if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
*out_flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
@@ -1454,22 +1457,21 @@ static NV_STATUS service_phys_notifications(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context)
{
NvU32 i;
uvm_access_counter_buffer_entry_t **notifications = batch_context->phys.notifications;
preprocess_phys_notifications(batch_context);
for (i = 0; i < batch_context->phys.num_notifications; ++i) {
NV_STATUS status;
uvm_access_counter_buffer_entry_t *current_entry = batch_context->phys.notifications[i];
unsigned flags = 0;
uvm_access_counter_buffer_entry_t *current_entry = notifications[i];
NvU32 flags = 0;
if (!UVM_ID_IS_VALID(current_entry->physical_info.resident_id))
continue;
status = service_phys_notification(gpu, batch_context, current_entry, &flags);
if (flags & UVM_ACCESS_COUNTER_ACTION_NOTIFY)
uvm_tools_broadcast_access_counter(gpu, current_entry, flags & UVM_ACCESS_COUNTER_ON_MANAGED);
if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
status = access_counter_clear_targeted(gpu, current_entry);
notify_tools_and_process_flags(gpu, &notifications[i], 1, flags);
if (status != NV_OK)
return status;
@@ -1478,152 +1480,218 @@ static NV_STATUS service_phys_notifications(uvm_gpu_t *gpu,
return NV_OK;
}
static int cmp_sort_gpu_phys_addr(const void *_a, const void *_b)
static NV_STATUS service_notification_va_block_helper(struct mm_struct *mm,
uvm_va_block_t *va_block,
uvm_processor_id_t processor,
uvm_access_counter_service_batch_context_t *batch_context)
{
return uvm_gpu_phys_addr_cmp(*(uvm_gpu_phys_address_t*)_a,
*(uvm_gpu_phys_address_t*)_b);
}
uvm_va_block_retry_t va_block_retry;
uvm_page_mask_t *accessed_pages = &batch_context->accessed_pages;
uvm_service_block_context_t *service_context = &batch_context->block_service_context;
static bool gpu_phys_same_region(uvm_gpu_phys_address_t a, uvm_gpu_phys_address_t b, NvU64 granularity)
{
if (a.aperture != b.aperture)
return false;
UVM_ASSERT(is_power_of_2(granularity));
return UVM_ALIGN_DOWN(a.address, granularity) == UVM_ALIGN_DOWN(b.address, granularity);
}
static bool phys_address_in_accessed_sub_region(uvm_gpu_phys_address_t address,
NvU64 region_size,
NvU64 sub_region_size,
NvU32 accessed_mask)
{
const unsigned accessed_index = (address.address % region_size) / sub_region_size;
// accessed_mask is only filled for tracking granularities larger than 64K
if (region_size == UVM_PAGE_SIZE_64K)
return true;
UVM_ASSERT(accessed_index < 32);
return ((1 << accessed_index) & accessed_mask) != 0;
}
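
/*
 * Standalone version of the accessed-mask test above: map an address to its
 * sub-region index within the tracked region and test that bit of the mask.
 * The 2MB/64KB sizes in main() are illustrative values, not a statement about
 * the configured granularity.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool sub_region_accessed(uint64_t address,
                                uint64_t region_size,
                                uint64_t sub_region_size,
                                uint32_t accessed_mask)
{
    unsigned index = (unsigned)((address % region_size) / sub_region_size);

    assert(index < 32);
    return (accessed_mask >> index) & 1u;
}

int main(void)
{
    uint64_t region = 2ull << 20;     /* 2MB tracking granularity */
    uint64_t sub    = 64ull << 10;    /* 64KB sub-granularity regions */
    uint32_t mask   = 0x5;            /* sub-regions 0 and 2 were accessed */

    printf("%d\n", sub_region_accessed(0x00000, region, sub, mask)); /* 1 */
    printf("%d\n", sub_region_accessed(0x10000, region, sub, mask)); /* 0 */
    printf("%d\n", sub_region_accessed(0x20000, region, sub, mask)); /* 1 */
    return 0;
}
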
static NV_STATUS service_virt_notification(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context,
const uvm_access_counter_buffer_entry_t *current_entry,
unsigned *out_flags)
{
NV_STATUS status = NV_OK;
NvU64 notification_size;
NvU64 address;
uvm_processor_id_t *resident_processors = batch_context->virt.scratch.resident_processors;
uvm_gpu_phys_address_t *phys_addresses = batch_context->virt.scratch.phys_addresses;
int num_addresses = 0;
int i;
// Virtual address notifications are always 64K aligned
NvU64 region_start = current_entry->address.address;
NvU64 region_end = current_entry->address.address + UVM_PAGE_SIZE_64K;
uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
uvm_access_counter_type_t counter_type = current_entry->counter_type;
const uvm_gpu_access_counter_type_config_t *config = get_config_for_type(access_counters, counter_type);
uvm_va_space_t *va_space = current_entry->virtual_info.va_space;
UVM_ASSERT(counter_type == UVM_ACCESS_COUNTER_TYPE_MIMC);
// Entries with NULL va_space are simply dropped.
if (!va_space)
if (uvm_page_mask_empty(accessed_pages))
return NV_OK;
status = config_granularity_to_bytes(config->rm.granularity, &notification_size);
if (status != NV_OK)
return status;
uvm_assert_mutex_locked(&va_block->lock);
// Collect physical locations that could have been touched
// in the reported 64K VA region. The notification mask can
// correspond to any of them.
uvm_va_space_down_read(va_space);
for (address = region_start; address < region_end;) {
uvm_va_block_t *va_block;
service_context->operation = UVM_SERVICE_OPERATION_ACCESS_COUNTERS;
service_context->num_retries = 0;
service_context->block_context.mm = mm;
NV_STATUS local_status = uvm_va_block_find(va_space, address, &va_block);
if (local_status == NV_ERR_INVALID_ADDRESS || local_status == NV_ERR_OBJECT_NOT_FOUND) {
address += PAGE_SIZE;
continue;
}
return UVM_VA_BLOCK_RETRY_LOCKED(va_block,
&va_block_retry,
service_va_block_locked(processor,
va_block,
&va_block_retry,
service_context,
accessed_pages));
}
uvm_mutex_lock(&va_block->lock);
while (address < va_block->end && address < region_end) {
const unsigned page_index = uvm_va_block_cpu_page_index(va_block, address);
static void expand_notification_block(struct mm_struct *mm,
uvm_gpu_va_space_t *gpu_va_space,
uvm_va_block_t *va_block,
uvm_page_mask_t *accessed_pages,
const uvm_access_counter_buffer_entry_t *current_entry)
{
NvU64 addr;
NvU64 granularity = 0;
uvm_gpu_t *resident_gpu = NULL;
uvm_processor_id_t resident_id;
uvm_page_index_t page_index;
uvm_gpu_t *gpu = gpu_va_space->gpu;
const uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
const uvm_gpu_access_counter_type_config_t *config = get_config_for_type(access_counters,
UVM_ACCESS_COUNTER_TYPE_MIMC);
// UVM va_block always maps the closest resident location to processor
const uvm_processor_id_t res_id = uvm_va_block_page_get_closest_resident(va_block, page_index, gpu->id);
config_granularity_to_bytes(config->rm.granularity, &granularity);
// Add physical location if it's valid and not local vidmem
if (UVM_ID_IS_VALID(res_id) && !uvm_id_equal(res_id, gpu->id)) {
uvm_gpu_phys_address_t phys_address = uvm_va_block_res_phys_page_address(va_block, page_index, res_id, gpu);
if (phys_address_in_accessed_sub_region(phys_address,
notification_size,
config->sub_granularity_region_size,
current_entry->sub_granularity)) {
resident_processors[num_addresses] = res_id;
phys_addresses[num_addresses] = phys_address;
++num_addresses;
}
else {
UVM_DBG_PRINT_RL("Skipping phys address %llx:%s, because it couldn't have been accessed in mask %x",
phys_address.address,
uvm_aperture_string(phys_address.aperture),
current_entry->sub_granularity);
}
}
// Granularities other than 2MB can only be enabled by UVM tests. Do nothing
// in that case.
if (granularity != UVM_PAGE_SIZE_2M)
return;
address += PAGE_SIZE;
}
uvm_mutex_unlock(&va_block->lock);
addr = current_entry->address.address;
uvm_assert_rwsem_locked(&gpu_va_space->va_space->lock);
uvm_assert_mutex_locked(&va_block->lock);
page_index = uvm_va_block_cpu_page_index(va_block, addr);
resident_id = uvm_va_block_page_get_closest_resident(va_block, page_index, gpu->id);
// resident_id might be invalid or might already be the same as the GPU
// which received the notification if the memory was already migrated before
// acquiring the locks either during the servicing of previous notifications
// or during faults or because of explicit migrations or if the VA range was
// freed after receiving the notification. Return NV_OK in such cases.
if (!UVM_ID_IS_VALID(resident_id) || uvm_id_equal(resident_id, gpu->id))
return;
if (UVM_ID_IS_GPU(resident_id))
resident_gpu = uvm_va_space_get_gpu(gpu_va_space->va_space, resident_id);
if (uvm_va_block_get_physical_size(va_block, resident_id, page_index) != granularity) {
uvm_page_mask_set(accessed_pages, page_index);
}
uvm_va_space_up_read(va_space);
else {
NvU32 region_start;
NvU32 region_end;
unsigned long sub_granularity = current_entry->sub_granularity;
NvU32 num_regions = config->sub_granularity_regions_per_translation;
NvU32 num_sub_pages = config->sub_granularity_region_size / PAGE_SIZE;
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);
// The addresses need to be sorted to aid coalescing.
sort(phys_addresses,
num_addresses,
sizeof(*phys_addresses),
cmp_sort_gpu_phys_addr,
NULL);
UVM_ASSERT(num_sub_pages >= 1);
for (i = 0; i < num_addresses; ++i) {
uvm_access_counter_buffer_entry_t *fake_entry = &batch_context->virt.scratch.phys_entry;
// Skip the current pointer if the physical region was already handled
if (i > 0 && gpu_phys_same_region(phys_addresses[i - 1], phys_addresses[i], notification_size)) {
UVM_ASSERT(uvm_id_equal(resident_processors[i - 1], resident_processors[i]));
continue;
// region_start and region_end refer to sub_granularity indices, not
// page_indices.
for_each_sub_granularity_region(region_start, region_end, sub_granularity, num_regions) {
uvm_page_mask_region_fill(accessed_pages,
uvm_va_block_region(region_start * num_sub_pages,
region_end * num_sub_pages));
}
UVM_DBG_PRINT_RL("Faking MIMC address[%i/%i]: %llx (granularity mask: %llx) in aperture %s on device %s\n",
i,
num_addresses,
phys_addresses[i].address,
notification_size - 1,
uvm_aperture_string(phys_addresses[i].aperture),
uvm_gpu_name(gpu));
// Construct a fake phys addr AC entry
fake_entry->counter_type = current_entry->counter_type;
fake_entry->address.address = UVM_ALIGN_DOWN(phys_addresses[i].address, notification_size);
fake_entry->address.aperture = phys_addresses[i].aperture;
fake_entry->address.is_virtual = false;
fake_entry->physical_info.resident_id = resident_processors[i];
fake_entry->counter_value = current_entry->counter_value;
fake_entry->sub_granularity = current_entry->sub_granularity;
// Remove pages in the va_block which are not resident on resident_id.
// If the GPU is heavily accessing those pages, future access counter
// migrations will migrate them to the GPU.
uvm_page_mask_and(accessed_pages, accessed_pages, resident_mask);
}
}
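
/*
 * The sub_granularity handling above fills the accessed-pages mask from runs
 * of set bits in the notification's sub-granularity mask. A standalone sketch
 * of walking such runs is shown below; a real implementation would convert
 * each [start, end) run into page indices instead of printing it.
 */
#include <stdint.h>
#include <stdio.h>

static void for_each_set_run(uint32_t mask, unsigned num_regions)
{
    unsigned start = 0;

    while (start < num_regions) {
        unsigned end;

        /* Skip clear bits. */
        while (start < num_regions && !((mask >> start) & 1u))
            start++;
        if (start == num_regions)
            break;

        /* Extend over the run of set bits. */
        end = start;
        while (end < num_regions && ((mask >> end) & 1u))
            end++;

        printf("region [%u, %u)\n", start, end);
        start = end;
    }
}

int main(void)
{
    for_each_set_run(0x000F00F1u, 32);  /* runs: [0,1), [4,8), [16,20) */
    return 0;
}
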
status = service_phys_notification(gpu, batch_context, fake_entry, out_flags);
if (status != NV_OK)
static NV_STATUS service_virt_notifications_in_block(struct mm_struct *mm,
uvm_gpu_va_space_t *gpu_va_space,
uvm_va_block_t *va_block,
uvm_access_counter_service_batch_context_t *batch_context,
NvU32 index,
NvU32 *out_index)
{
NvU32 i = index;
NvU32 flags = 0;
NV_STATUS status = NV_OK;
NV_STATUS flags_status;
uvm_gpu_t *gpu = gpu_va_space->gpu;
uvm_va_space_t *va_space = gpu_va_space->va_space;
uvm_page_mask_t *accessed_pages = &batch_context->accessed_pages;
uvm_access_counter_buffer_entry_t **notifications = batch_context->virt.notifications;
UVM_ASSERT(va_block);
UVM_ASSERT(i < batch_context->virt.num_notifications);
uvm_assert_rwsem_locked(&va_space->lock);
uvm_page_mask_zero(accessed_pages);
uvm_mutex_lock(&va_block->lock);
while (i < batch_context->virt.num_notifications) {
uvm_access_counter_buffer_entry_t *current_entry = notifications[i];
NvU64 address = current_entry->address.address;
if ((current_entry->virtual_info.va_space != va_space) || (address > va_block->end)) {
*out_index = i;
break;
}
expand_notification_block(mm, gpu_va_space, va_block, accessed_pages, current_entry);
i++;
*out_index = i;
}
status = service_notification_va_block_helper(mm, va_block, gpu->id, batch_context);
uvm_mutex_unlock(&va_block->lock);
// At least one notification should have been processed.
UVM_ASSERT(index < *out_index);
if (status == NV_OK)
flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
flags_status = notify_tools_and_process_flags(gpu, &notifications[index], *out_index - index, flags);
if ((status == NV_OK) && (flags_status != NV_OK))
status = flags_status;
return status;
}
static NV_STATUS service_virt_notifications_batch(struct mm_struct *mm,
uvm_gpu_va_space_t *gpu_va_space,
uvm_access_counter_service_batch_context_t *batch_context,
NvU32 index,
NvU32 *out_index)
{
NV_STATUS status;
uvm_va_block_t *va_block;
uvm_va_space_t *va_space = gpu_va_space->va_space;
uvm_access_counter_buffer_entry_t *current_entry = batch_context->virt.notifications[index];
NvU64 address = current_entry->address.address;
UVM_ASSERT(va_space);
uvm_assert_rwsem_locked(&va_space->lock);
// Virtual address notifications are always 64K aligned
UVM_ASSERT(IS_ALIGNED(address, UVM_PAGE_SIZE_64K));
// TODO: Bug 4309292: [UVM][HMM] Re-enable access counter HMM block
// migrations for virtual notifications on configs with
// 4KB page size
status = uvm_va_block_find(va_space, address, &va_block);
if ((status == NV_OK) && !uvm_va_block_is_hmm(va_block)) {
UVM_ASSERT(va_block);
status = service_virt_notifications_in_block(mm, gpu_va_space, va_block, batch_context, index, out_index);
}
else {
NvU32 flags = 0;
UVM_ASSERT((status == NV_ERR_OBJECT_NOT_FOUND) ||
(status == NV_ERR_INVALID_ADDRESS) ||
uvm_va_block_is_hmm(va_block));
// NV_ERR_OBJECT_NOT_FOUND is returned if the VA range is valid but no
// VA block has been allocated yet. This can happen if there are stale
// notifications in the batch. A new VA range may have been allocated in
// that range. So, clear the notification entry to continue getting
// notifications for the new VA range.
if (status == NV_ERR_OBJECT_NOT_FOUND)
flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
// NV_ERR_INVALID_ADDRESS is returned if the corresponding VA range
// doesn't exist or it's not a managed range. Access counter migrations
// are not currently supported on such ranges.
//
// TODO: Bug 1990466: [uvm] Use access counters to trigger migrations
// When support for SAM migrations is added, clear the notification
// entry if the VA range doesn't exist in order to receive notifications
// when a new VA range is allocated in that region.
status = notify_tools_and_process_flags(gpu_va_space->gpu, &batch_context->virt.notifications[index], 1, flags);
*out_index = index + 1;
status = NV_OK;
}
return status;
@@ -1632,33 +1700,67 @@ static NV_STATUS service_virt_notification(uvm_gpu_t *gpu,
static NV_STATUS service_virt_notifications(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context)
{
NvU32 i;
NvU32 i = 0;
NV_STATUS status = NV_OK;
struct mm_struct *mm = NULL;
uvm_va_space_t *va_space = NULL;
uvm_va_space_t *prev_va_space = NULL;
uvm_gpu_va_space_t *gpu_va_space = NULL;
// TODO: Bug 4299018 : Add support for virtual access counter migrations on
// 4K page sizes.
if (PAGE_SIZE == UVM_PAGE_SIZE_4K) {
return notify_tools_and_process_flags(gpu,
batch_context->virt.notifications,
batch_context->virt.num_notifications,
0);
}
preprocess_virt_notifications(gpu, batch_context);
for (i = 0; i < batch_context->virt.num_notifications; ++i) {
unsigned flags = 0;
while (i < batch_context->virt.num_notifications) {
uvm_access_counter_buffer_entry_t *current_entry = batch_context->virt.notifications[i];
va_space = current_entry->virtual_info.va_space;
status = service_virt_notification(gpu, batch_context, current_entry, &flags);
if (va_space != prev_va_space) {
UVM_DBG_PRINT_RL("Processed virt access counter (%d/%d): %sMANAGED (status: %d) clear: %s\n",
i + 1,
batch_context->virt.num_notifications,
(flags & UVM_ACCESS_COUNTER_ON_MANAGED) ? "" : "NOT ",
status,
(flags & UVM_ACCESS_COUNTER_ACTION_CLEAR) ? "YES" : "NO");
// New va_space detected, drop locks of the old va_space.
if (prev_va_space) {
uvm_va_space_up_read(prev_va_space);
uvm_va_space_mm_release_unlock(prev_va_space, mm);
if (uvm_enable_builtin_tests)
uvm_tools_broadcast_access_counter(gpu, current_entry, flags & UVM_ACCESS_COUNTER_ON_MANAGED);
mm = NULL;
gpu_va_space = NULL;
}
if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
status = access_counter_clear_targeted(gpu, current_entry);
// Acquire locks for the new va_space.
if (va_space) {
mm = uvm_va_space_mm_retain_lock(va_space);
uvm_va_space_down_read(va_space);
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
}
prev_va_space = va_space;
}
if (va_space && gpu_va_space && uvm_va_space_has_access_counter_migrations(va_space)) {
status = service_virt_notifications_batch(mm, gpu_va_space, batch_context, i, &i);
}
else {
status = notify_tools_and_process_flags(gpu, &batch_context->virt.notifications[i], 1, 0);
i++;
}
if (status != NV_OK)
break;
}
if (va_space) {
uvm_va_space_up_read(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
}
return status;
}
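
/*
 * The loop above processes the sorted notifications as runs that share a
 * va_space, acquiring the per-space locks only when the key changes. A generic
 * standalone sketch of that "group by key" walk follows; acquire_key() and
 * release_key() are stand-ins for the retain/down_read and up_read/release
 * pairs, and the batching within a run is omitted.
 */
#include <stddef.h>
#include <stdio.h>

typedef struct { int key; int value; } item_t;

static void acquire_key(int key) { printf("acquire %d\n", key); }
static void release_key(int key) { printf("release %d\n", key); }

static void process_runs(const item_t *items, size_t n)
{
    size_t i = 0;
    int prev_key = 0;
    int have_prev = 0;

    while (i < n) {
        int key = items[i].key;

        /* Key changed: drop state for the previous run, set up the new one. */
        if (!have_prev || key != prev_key) {
            if (have_prev)
                release_key(prev_key);
            acquire_key(key);
            prev_key = key;
            have_prev = 1;
        }

        printf("  item %d\n", items[i].value);
        i++;
    }

    if (have_prev)
        release_key(prev_key);
}

int main(void)
{
    const item_t items[] = { {1, 10}, {1, 11}, {2, 20}, {3, 30}, {3, 31} };

    process_runs(items, sizeof(items) / sizeof(items[0]));
    return 0;
}
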
@@ -1941,6 +2043,7 @@ NV_STATUS uvm_test_reset_access_counters(UVM_TEST_RESET_ACCESS_COUNTERS_PARAMS *
}
else {
uvm_access_counter_buffer_entry_t entry = { 0 };
uvm_access_counter_buffer_entry_t *notification = &entry;
if (params->counter_type == UVM_TEST_ACCESS_COUNTER_TYPE_MIMC)
entry.counter_type = UVM_ACCESS_COUNTER_TYPE_MIMC;
@@ -1950,7 +2053,7 @@ NV_STATUS uvm_test_reset_access_counters(UVM_TEST_RESET_ACCESS_COUNTERS_PARAMS *
entry.bank = params->bank;
entry.tag = params->tag;
status = access_counter_clear_targeted(gpu, &entry);
status = access_counter_clear_notifications(gpu, &notification, 1);
}
if (status == NV_OK)

View File

@@ -235,17 +235,27 @@ static NV_STATUS fetch_non_replayable_fault_buffer_entries(uvm_parent_gpu_t *par
return NV_OK;
}
// In SRIOV, the UVM (guest) driver does not have access to the privileged
// registers used to clear the faulted bit. Instead, UVM requests host RM to do
// the clearing on its behalf, using a SW method.
static bool use_clear_faulted_channel_sw_method(uvm_gpu_t *gpu)
{
if (uvm_gpu_is_virt_mode_sriov(gpu)) {
UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);
return true;
}
// If true, UVM uses a SW method to request RM to do the clearing on its
// behalf.
bool use_sw_method = false;
return false;
// In SRIOV, the UVM (guest) driver does not have access to the privileged
// registers used to clear the faulted bit.
if (uvm_gpu_is_virt_mode_sriov(gpu))
use_sw_method = true;
// In Confidential Computing access to the privileged registers is blocked,
// in order to prevent interference between guests, or between the
// (untrusted) host and the guests.
if (g_uvm_global.conf_computing_enabled)
use_sw_method = true;
if (use_sw_method)
UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);
return use_sw_method;
}
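
/*
 * Standalone restatement of the decision above: the faulted bit is cleared via
 * a SW method (delegated to RM) whenever the driver cannot touch the
 * privileged registers itself, i.e. under SR-IOV or Confidential Computing.
 * The two boolean inputs are placeholders for the driver's globals.
 */
#include <stdbool.h>
#include <stdio.h>

static bool clear_faulted_via_sw_method(bool is_sriov_guest, bool conf_computing)
{
    return is_sriov_guest || conf_computing;
}

int main(void)
{
    printf("bare metal:      %d\n", clear_faulted_via_sw_method(false, false));
    printf("SR-IOV guest:    %d\n", clear_faulted_via_sw_method(true,  false));
    printf("conf. computing: %d\n", clear_faulted_via_sw_method(false, true));
    return 0;
}
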
static NV_STATUS clear_faulted_method_on_gpu(uvm_gpu_t *gpu,
@@ -570,7 +580,7 @@ static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
ats_context->client_type = UVM_FAULT_CLIENT_TYPE_HUB;
ats_invalidate->write_faults_in_batch = false;
ats_invalidate->tlb_batch_pending = false;
va_range_next = uvm_va_space_iter_first(gpu_va_space->va_space, fault_entry->fault_address, ~0ULL);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -362,7 +362,8 @@ static NV_STATUS push_cancel_on_gpu(uvm_gpu_t *gpu,
"Cancel targeting instance_ptr {0x%llx:%s}\n",
instance_ptr.address,
uvm_aperture_string(instance_ptr.aperture));
} else {
}
else {
status = uvm_push_begin_acquire(gpu->channel_manager,
UVM_CHANNEL_TYPE_MEMOPS,
&replayable_faults->replay_tracker,
@@ -697,9 +698,6 @@ static inline int cmp_access_type(uvm_fault_access_type_t a, uvm_fault_access_ty
typedef enum
{
// Fetch a batch of faults from the buffer.
FAULT_FETCH_MODE_BATCH_ALL,
// Fetch a batch of faults from the buffer. Stop at the first entry that is
// not ready yet
FAULT_FETCH_MODE_BATCH_READY,
@@ -857,9 +855,7 @@ static NV_STATUS fetch_fault_buffer_entries(uvm_gpu_t *gpu,
// written out of order
UVM_SPIN_WHILE(!gpu->parent->fault_buffer_hal->entry_is_valid(gpu->parent, get), &spin) {
// We have some entry to work on. Let's do the rest later.
if (fetch_mode != FAULT_FETCH_MODE_ALL &&
fetch_mode != FAULT_FETCH_MODE_BATCH_ALL &&
fault_index > 0)
if (fetch_mode == FAULT_FETCH_MODE_BATCH_READY && fault_index > 0)
goto done;
}
@@ -888,6 +884,7 @@ static NV_STATUS fetch_fault_buffer_entries(uvm_gpu_t *gpu,
current_entry->va_space = NULL;
current_entry->filtered = false;
current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
if (current_entry->fault_source.utlb_id > batch_context->max_utlb_id) {
UVM_ASSERT(current_entry->fault_source.utlb_id < replayable_faults->utlb_count);
@@ -1184,7 +1181,11 @@ static void mark_fault_fatal(uvm_fault_service_batch_context_t *batch_context,
fault_entry->replayable.cancel_va_mode = cancel_va_mode;
utlb->has_fatal_faults = true;
batch_context->has_fatal_faults = true;
if (!batch_context->fatal_va_space) {
UVM_ASSERT(fault_entry->va_space);
batch_context->fatal_va_space = fault_entry->va_space;
}
}
static void fault_entry_duplicate_flags(uvm_fault_service_batch_context_t *batch_context,
@@ -1378,7 +1379,10 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
UVM_ASSERT(current_entry->fault_access_type ==
uvm_fault_access_type_mask_highest(current_entry->access_type_mask));
current_entry->is_fatal = false;
// Unserviceable faults were already skipped by the caller. There are no
// unserviceable fault types that could be in the same VA block as a
// serviceable fault.
UVM_ASSERT(!current_entry->is_fatal);
current_entry->is_throttled = false;
current_entry->is_invalid_prefetch = false;
@@ -1512,7 +1516,7 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
++block_context->num_retries;
if (status == NV_OK && batch_context->has_fatal_faults)
if (status == NV_OK && batch_context->fatal_va_space)
status = uvm_va_block_set_cancel(va_block, &block_context->block_context, gpu);
return status;
@@ -1676,7 +1680,8 @@ static NV_STATUS service_fault_batch_ats_sub_vma(uvm_gpu_va_space_t *gpu_va_spac
if (access_type <= UVM_FAULT_ACCESS_TYPE_READ) {
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
else if (access_type >= UVM_FAULT_ACCESS_TYPE_WRITE) {
else {
UVM_ASSERT(access_type >= UVM_FAULT_ACCESS_TYPE_WRITE);
if (uvm_fault_access_type_mask_test(current_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ) &&
!uvm_page_mask_test(reads_serviced_mask, page_index))
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
@@ -1735,6 +1740,10 @@ static NV_STATUS service_fault_batch_ats_sub(uvm_gpu_va_space_t *gpu_va_space,
uvm_fault_access_type_t access_type = current_entry->fault_access_type;
bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
// ATS faults can't be unserviceable, since unserviceable faults require
// GMMU PTEs.
UVM_ASSERT(!current_entry->is_fatal);
i++;
update_batch_and_notify_fault(gpu_va_space->gpu,
@@ -1934,14 +1943,198 @@ static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
return status;
}
// Called when a fault in the batch has been marked fatal. Flush the buffer
// under the VA and mmap locks to remove any potential stale fatal faults, then
// service all new faults for just that VA space and cancel those which are
// fatal. Faults in other VA spaces are replayed when done and will be processed
// when normal fault servicing resumes.
static NV_STATUS service_fault_batch_for_cancel(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
NV_STATUS status = NV_OK;
NvU32 i;
uvm_va_space_t *va_space = batch_context->fatal_va_space;
uvm_gpu_va_space_t *gpu_va_space = NULL;
struct mm_struct *mm;
uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
uvm_service_block_context_t *service_context = &gpu->parent->fault_buffer_info.replayable.block_service_context;
uvm_va_block_context_t *va_block_context = &service_context->block_context;
UVM_ASSERT(gpu->parent->replayable_faults_supported);
UVM_ASSERT(va_space);
// Perform the flush and re-fetch while holding the mmap_lock and the
// VA space lock. This avoids stale faults because it prevents any vma
// modifications (mmap, munmap, mprotect) from happening between the time HW
// takes the fault and we cancel it.
mm = uvm_va_space_mm_retain_lock(va_space);
va_block_context->mm = mm;
uvm_va_space_down_read(va_space);
// We saw fatal faults in this VA space before. Flush while holding
// mmap_lock to make sure those faults come back (aren't stale).
//
// We need to wait until all old fault messages have arrived before
// flushing, hence UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT.
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status != NV_OK)
goto done;
// Wait for the flush's replay to finish to give the legitimate faults a
// chance to show up in the buffer again.
status = uvm_tracker_wait(&replayable_faults->replay_tracker);
if (status != NV_OK)
goto done;
// We expect all replayed faults to have arrived in the buffer so we can re-
// service them. The replay-and-wait sequence above will ensure they're all
// in the HW buffer. When GSP owns the HW buffer, we also have to wait for
// GSP to copy all available faults from the HW buffer into the shadow
// buffer.
//
// TODO: Bug 2533557: This flush does not actually guarantee that GSP will
// copy over all faults.
status = hw_fault_buffer_flush_locked(gpu->parent);
if (status != NV_OK)
goto done;
// If there is no GPU VA space for the GPU, ignore all faults in the VA
// space. This can happen if the GPU VA space has been destroyed since we
// unlocked the VA space in service_fault_batch. That means the fatal faults
// are stale, because unregistering the GPU VA space requires preempting the
// context and detaching all channels in that VA space. Restart fault
// servicing from the top.
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (!gpu_va_space)
goto done;
// Re-parse the new faults
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_duplicate_faults = 0;
batch_context->num_replays = 0;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_ALL);
if (status != NV_OK)
goto done;
// No more faults left. Either the previously-seen fatal entry was stale, or
// RM killed the context underneath us.
if (batch_context->num_cached_faults == 0)
goto done;
++batch_context->batch_id;
status = preprocess_fault_batch(gpu, batch_context);
if (status != NV_OK) {
if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
// Another flush happened due to stale faults or a context-fatal
// error. The previously-seen fatal fault might not exist anymore,
// so restart fault servicing from the top.
status = NV_OK;
}
goto done;
}
// Search for the target VA space
for (i = 0; i < batch_context->num_coalesced_faults; i++) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
UVM_ASSERT(current_entry->va_space);
if (current_entry->va_space == va_space)
break;
}
while (i < batch_context->num_coalesced_faults) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
if (current_entry->va_space != va_space)
break;
// service_fault_batch_dispatch() doesn't expect unserviceable faults.
// Just cancel them directly.
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, UVM_FAULT_CANCEL_VA_MODE_ALL);
if (status != NV_OK)
break;
++i;
}
else {
uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
NvU32 block_faults;
ats_invalidate->tlb_batch_pending = false;
uvm_hmm_service_context_init(service_context);
// Service all the faults that we can. We only really need to search
// for fatal faults, but attempting to service all is the easiest
// way to do that.
status = service_fault_batch_dispatch(va_space, gpu_va_space, batch_context, i, &block_faults, false);
if (status != NV_OK) {
// TODO: Bug 3900733: clean up locking in service_fault_batch().
// We need to drop lock and retry. That means flushing and
// starting over.
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
status = NV_OK;
break;
}
// Invalidate TLBs before cancel to ensure that fatal faults don't
// get stuck in HW behind non-fatal faults to the same line.
status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
if (status != NV_OK)
break;
while (block_faults-- > 0) {
current_entry = batch_context->ordered_fault_cache[i];
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode);
if (status != NV_OK)
break;
}
++i;
}
}
}
done:
uvm_va_space_up_read(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
if (status == NV_OK) {
// There are two reasons to flush the fault buffer here.
//
// 1) Functional. We need to replay both the serviced non-fatal faults
// and the skipped faults in other VA spaces. The former need to be
// restarted and the latter need to be replayed so the normal fault
// service mechanism can fetch and process them.
//
// 2) Performance. After cancelling the fatal faults, a flush removes
// any potential duplicated fault that may have been added while
// processing the faults in this batch. This flush also avoids doing
// unnecessary processing after the fatal faults have been cancelled,
// so all the rest are unlikely to remain after a replay because the
// context is probably in the process of dying.
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
}
return status;
}
// Scan the ordered view of faults and group them by different va_blocks
// (managed faults) and service faults for each va_block, in batch.
// Service non-managed faults one at a time as they are encountered during the
// scan.
//
// This function returns NV_WARN_MORE_PROCESSING_REQUIRED if the fault buffer
// was flushed because the needs_fault_buffer_flush flag was set on some GPU VA
// space
// Fatal faults are marked for later processing by the caller.
static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
fault_service_mode_t service_mode,
uvm_fault_service_batch_context_t *batch_context)
@@ -1960,7 +2153,7 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
UVM_ASSERT(gpu->parent->replayable_faults_supported);
ats_invalidate->write_faults_in_batch = false;
ats_invalidate->tlb_batch_pending = false;
uvm_hmm_service_context_init(service_context);
for (i = 0; i < batch_context->num_coalesced_faults;) {
@@ -1995,38 +2188,25 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
va_block_context->mm = mm;
uvm_va_space_down_read(va_space);
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (uvm_processor_mask_test_and_clear_atomic(&va_space->needs_fault_buffer_flush, gpu->id)) {
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status == NV_OK)
status = NV_WARN_MORE_PROCESSING_REQUIRED;
break;
}
// The case where there is no valid GPU VA space for the GPU in this
// VA space is handled next
}
// Some faults could be already fatal if they cannot be handled by
// the UVM driver
if (current_entry->is_fatal) {
++i;
batch_context->has_fatal_faults = true;
if (!batch_context->fatal_va_space)
batch_context->fatal_va_space = va_space;
utlb->has_fatal_faults = true;
UVM_ASSERT(utlb->num_pending_faults > 0);
continue;
}
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
if (!gpu_va_space) {
// If there is no GPU VA space for the GPU, ignore the fault. This
// can happen if a GPU VA space is destroyed without explicitly
// freeing all memory ranges (destroying the VA range triggers a
// flush of the fault buffer) and there are stale entries in the
// freeing all memory ranges and there are stale entries in the
// buffer that got fixed by the servicing in a previous batch.
++i;
continue;
@@ -2044,15 +2224,17 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
uvm_va_space_mm_release_unlock(va_space, mm);
mm = NULL;
va_space = NULL;
status = NV_OK;
continue;
}
if (status != NV_OK)
goto fail;
i += block_faults;
// Don't issue replays in cancel mode
if (replay_per_va_block && !batch_context->has_fatal_faults) {
if (replay_per_va_block && !batch_context->fatal_va_space) {
status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
if (status != NV_OK)
goto fail;
@@ -2064,8 +2246,6 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
}
}
// Only clobber status if invalidate_status != NV_OK, since status may also
// contain NV_WARN_MORE_PROCESSING_REQUIRED.
if (va_space != NULL) {
NV_STATUS invalidate_status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
if (invalidate_status != NV_OK)
@@ -2273,77 +2453,48 @@ static NvU32 is_fatal_fault_in_buffer(uvm_fault_service_batch_context_t *batch_c
return false;
}
typedef enum
{
// Only cancel faults flagged as fatal
FAULT_CANCEL_MODE_FATAL,
// Cancel all faults in the batch unconditionally
FAULT_CANCEL_MODE_ALL,
} fault_cancel_mode_t;
// Cancel faults in the given fault service batch context. The function provides
// two different modes depending on the value of cancel_mode:
// - If cancel_mode == FAULT_CANCEL_MODE_FATAL, only faults flagged as fatal
// will be cancelled. In this case, the reason reported to tools is the one
// contained in the fault entry itself.
// - If cancel_mode == FAULT_CANCEL_MODE_ALL, all faults will be cancelled
// unconditionally. In this case, the reason reported to tools for non-fatal
// faults is the one passed to this function.
static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu,
uvm_fault_service_batch_context_t *batch_context,
fault_cancel_mode_t cancel_mode,
UvmEventFatalReason reason)
// Cancel all faults in the given fault service batch context, even those not
// marked as fatal.
static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
uvm_fault_service_batch_context_t *batch_context,
UvmEventFatalReason reason)
{
NV_STATUS status = NV_OK;
NV_STATUS fault_status;
uvm_va_space_t *va_space = NULL;
NvU32 i;
NvU32 i = 0;
UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
if (cancel_mode == FAULT_CANCEL_MODE_ALL)
UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
while (i < batch_context->num_coalesced_faults && status == NV_OK) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
uvm_va_space_t *va_space = current_entry->va_space;
bool skip_va_space;
UVM_ASSERT(current_entry->va_space);
UVM_ASSERT(va_space);
if (current_entry->va_space != va_space) {
// Fault on a different va_space, drop the lock of the old one...
if (va_space != NULL)
uvm_va_space_up_read(va_space);
uvm_va_space_down_read(va_space);
va_space = current_entry->va_space;
// If there is no GPU VA space for the GPU, ignore all faults in
// that VA space. This can happen if the GPU VA space has been
// destroyed since we unlocked the VA space in service_fault_batch.
// Ignoring the fault avoids targeting a PDB that might have been
// reused by another process.
skip_va_space = !uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
// ... and take the lock of the new one
uvm_va_space_down_read(va_space);
for (;
i < batch_context->num_coalesced_faults && current_entry->va_space == va_space;
current_entry = batch_context->ordered_fault_cache[++i]) {
uvm_fault_cancel_va_mode_t cancel_va_mode;
// We don't need to check whether a buffer flush is required
// (due to VA range destruction).
// - For cancel_mode == FAULT_CANCEL_MODE_FATAL, once a fault is
// flagged as fatal we need to cancel it, even if its VA range no
// longer exists.
// - For cancel_mode == FAULT_CANCEL_MODE_ALL we don't care about
// any of this, we just want to trigger RC in RM.
}
if (skip_va_space)
continue;
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
// If there is no GPU VA space for the GPU, ignore the fault.
// This can happen if the GPU VA did not exist in
// service_fault_batch(), or it was destroyed since then.
// This is to avoid targeting a PDB that might have been reused
// by another process.
continue;
}
// Cancel the fault
if (cancel_mode == FAULT_CANCEL_MODE_ALL || current_entry->is_fatal) {
uvm_fault_cancel_va_mode_t cancel_va_mode = current_entry->replayable.cancel_va_mode;
// If cancelling unconditionally and the fault was not fatal,
// set the cancel reason passed to this function
if (!current_entry->is_fatal) {
if (current_entry->is_fatal) {
UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
cancel_va_mode = current_entry->replayable.cancel_va_mode;
}
else {
current_entry->fatal_reason = reason;
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
@@ -2352,17 +2503,13 @@ static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu,
if (status != NV_OK)
break;
}
uvm_va_space_up_read(va_space);
}
if (va_space != NULL)
uvm_va_space_up_read(va_space);
// After cancelling the fatal faults, the fault buffer is flushed to remove
// any potential duplicated fault that may have been added while processing
// the faults in this batch. This flush also avoids doing unnecessary
// processing after the fatal faults have been cancelled, so all the rest
// are unlikely to remain after a replay because the context is probably in
// the process of dying.
// Because each cancel itself triggers a replay, there may be a large number
// of new duplicated faults in the buffer after cancelling all the known
// ones. Flushing the buffer discards them to avoid unnecessary processing.
fault_status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
@@ -2410,12 +2557,12 @@ static void cancel_fault_batch(uvm_gpu_t *gpu,
uvm_fault_service_batch_context_t *batch_context,
UvmEventFatalReason reason)
{
if (gpu->parent->fault_cancel_va_supported) {
cancel_faults_precise_va(gpu, batch_context, FAULT_CANCEL_MODE_ALL, reason);
return;
}
cancel_fault_batch_tlb(gpu, batch_context, reason);
// Return code is ignored since we're on a global error path and wouldn't be
// able to recover anyway.
if (gpu->parent->fault_cancel_va_supported)
cancel_faults_all(gpu, batch_context, reason);
else
cancel_fault_batch_tlb(gpu, batch_context, reason);
}
@@ -2502,7 +2649,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_replays = 0;
batch_context->has_fatal_faults = false;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
// 5) Fetch all faults from buffer
@@ -2549,9 +2696,6 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
// 8) Service all non-fatal faults and mark all non-serviceable faults
// as fatal
status = service_fault_batch(gpu, FAULT_SERVICE_MODE_CANCEL, batch_context);
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
continue;
UVM_ASSERT(batch_context->num_replays == 0);
if (status == NV_ERR_NO_MEMORY)
continue;
@@ -2559,7 +2703,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
break;
// No more fatal faults left, we are done
if (!batch_context->has_fatal_faults)
if (!batch_context->fatal_va_space)
break;
// 9) Search for uTLBs that contain fatal faults and meet the
@@ -2581,13 +2725,9 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
static NV_STATUS cancel_faults_precise(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
UVM_ASSERT(batch_context->has_fatal_faults);
if (gpu->parent->fault_cancel_va_supported) {
return cancel_faults_precise_va(gpu,
batch_context,
FAULT_CANCEL_MODE_FATAL,
UvmEventFatalReasonInvalid);
}
UVM_ASSERT(batch_context->fatal_va_space);
if (gpu->parent->fault_cancel_va_supported)
return service_fault_batch_for_cancel(gpu, batch_context);
return cancel_faults_precise_tlb(gpu, batch_context);
}
@@ -2643,7 +2783,7 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_duplicate_faults = 0;
batch_context->num_replays = 0;
batch_context->has_fatal_faults = false;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_BATCH_READY);
@@ -2671,9 +2811,6 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
// was flushed
num_replays += batch_context->num_replays;
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
continue;
enable_disable_prefetch_faults(gpu->parent, batch_context);
if (status != NV_OK) {
@@ -2687,10 +2824,17 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
break;
}
if (batch_context->has_fatal_faults) {
if (batch_context->fatal_va_space) {
status = uvm_tracker_wait(&batch_context->tracker);
if (status == NV_OK)
if (status == NV_OK) {
status = cancel_faults_precise(gpu, batch_context);
if (status == NV_OK) {
// Cancel handling should've issued at least one replay
UVM_ASSERT(batch_context->num_replays > 0);
++num_batches;
continue;
}
}
break;
}

View File

@@ -103,5 +103,7 @@ void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->map_remap_larger_page_promotion = false;
parent_gpu->plc_supported = true;
parent_gpu->no_ats_range_required = true;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2020-2022 NVIDIA Corporation
Copyright (c) 2020-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -33,6 +33,7 @@
#include "uvm_types.h"
#include "uvm_global.h"
#include "uvm_common.h"
#include "uvm_hal.h"
#include "uvm_hal_types.h"
#include "uvm_hopper_fault_buffer.h"
@@ -42,6 +43,10 @@
#define MMU_BIG 0
#define MMU_SMALL 1
// Used in pde_pcf().
#define ATS_ALLOWED 0
#define ATS_NOT_ALLOWED 1
uvm_mmu_engine_type_t uvm_hal_hopper_mmu_engine_id_to_type(NvU16 mmu_engine_id)
{
if (mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_HOST0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_HOST44)
@@ -260,7 +265,108 @@ static NvU64 poisoned_pte_hopper(void)
return WRITE_HWCONST64(pte_bits, _MMU_VER3, PTE, PCF, PRIVILEGE_RO_NO_ATOMIC_UNCACHED_ACD);
}
static NvU64 single_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, NvU32 depth)
typedef enum
{
PDE_TYPE_SINGLE,
PDE_TYPE_DUAL_BIG,
PDE_TYPE_DUAL_SMALL,
PDE_TYPE_COUNT,
} pde_type_t;
static const NvU8 valid_pcf[][2] = { { NV_MMU_VER3_PDE_PCF_VALID_UNCACHED_ATS_ALLOWED,
NV_MMU_VER3_PDE_PCF_VALID_UNCACHED_ATS_NOT_ALLOWED },
{ NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_UNCACHED_ATS_ALLOWED,
NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_UNCACHED_ATS_NOT_ALLOWED },
{ NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_UNCACHED_ATS_ALLOWED,
NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_UNCACHED_ATS_NOT_ALLOWED } };
static const NvU8 invalid_pcf[][2] = { { NV_MMU_VER3_PDE_PCF_INVALID_ATS_ALLOWED,
NV_MMU_VER3_PDE_PCF_INVALID_ATS_NOT_ALLOWED },
{ NV_MMU_VER3_DUAL_PDE_PCF_BIG_INVALID_ATS_ALLOWED,
NV_MMU_VER3_DUAL_PDE_PCF_BIG_INVALID_ATS_NOT_ALLOWED },
{ NV_MMU_VER3_DUAL_PDE_PCF_SMALL_INVALID_ATS_ALLOWED,
NV_MMU_VER3_DUAL_PDE_PCF_SMALL_INVALID_ATS_NOT_ALLOWED } };
static const NvU8 va_base[] = { 56, 47, 38, 29, 21 };
static bool is_ats_range_valid(uvm_page_directory_t *dir, NvU32 child_index)
{
NvU64 pde_base_va;
NvU64 min_va_upper;
NvU64 max_va_lower;
NvU32 index_in_dir;
uvm_cpu_get_unaddressable_range(&max_va_lower, &min_va_upper);
UVM_ASSERT(dir->depth < ARRAY_SIZE(va_base));
// We can use UVM_PAGE_SIZE_AGNOSTIC because page_size is only used in
// index_bits_hopper() for PTE table, i.e., depth 5+, which does not use a
// PDE PCF or an ATS_ALLOWED/NOT_ALLOWED setting.
UVM_ASSERT(child_index < (1ull << index_bits_hopper(dir->depth, UVM_PAGE_SIZE_AGNOSTIC)));
pde_base_va = 0;
index_in_dir = child_index;
while (dir) {
pde_base_va += index_in_dir * (1ull << va_base[dir->depth]);
index_in_dir = dir->index_in_parent;
dir = dir->host_parent;
}
pde_base_va = (NvU64)((NvS64)(pde_base_va << (64 - num_va_bits_hopper())) >> (64 - num_va_bits_hopper()));
if (pde_base_va < max_va_lower || pde_base_va >= min_va_upper)
return true;
return false;
}
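
/*
 * is_ats_range_valid() rebuilds a PDE's base VA from its index chain and then
 * sign-extends it into canonical form before comparing it against the
 * unaddressable hole. The sketch below shows only the sign-extension step,
 * assuming a 57-bit VA width purely as an example value.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t canonicalize(uint64_t va, unsigned va_bits)
{
    unsigned shift = 64 - va_bits;

    /* Shift the top implemented VA bit into bit 63, then arithmetic-shift
     * back down so it is replicated into the upper, unimplemented bits. */
    return (uint64_t)(((int64_t)(va << shift)) >> shift);
}

int main(void)
{
    unsigned va_bits = 57;

    /* An address with the top implemented bit set sign-extends into the
     * high half of the canonical address space. */
    uint64_t high = canonicalize(1ull << 56, va_bits);
    /* A low address is unchanged. */
    uint64_t low  = canonicalize(0x12345000ull, va_bits);

    printf("high: 0x%016llx\n", (unsigned long long)high); /* 0xff00000000000000 */
    printf("low:  0x%016llx\n", (unsigned long long)low);
    return 0;
}
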
// PDE Permission Control Flags
static NvU32 pde_pcf(bool valid, pde_type_t pde_type, uvm_page_directory_t *dir, NvU32 child_index)
{
const NvU8 (*pcf)[2] = valid ? valid_pcf : invalid_pcf;
NvU8 depth = dir->depth;
UVM_ASSERT(pde_type < PDE_TYPE_COUNT);
UVM_ASSERT(depth < 5);
// On non-ATS systems, PDE PCF only sets the valid and volatile/cache bits.
if (!g_uvm_global.ats.enabled)
return pcf[pde_type][ATS_ALLOWED];
// We assume all supported ATS platforms use canonical form address.
// See comments in uvm_gpu.c:uvm_gpu_can_address() and in
// uvm_mmu.c:page_tree_ats_init();
UVM_ASSERT(uvm_platform_uses_canonical_form_address());
// Hopper GPUs on ATS-enabled systems perform a parallel lookup on both
// ATS and GMMU page tables. For managed memory we need to prevent this
// parallel lookup since we would not get any GPU fault if the CPU has
// a valid mapping. Also, for external ranges that are known to be
// mapped entirely on the GMMU page table we can skip the ATS lookup
// for performance reasons. Parallel ATS lookup is disabled in PDE1
// (depth 3) and, therefore, it applies to the underlying 512MB VA
// range.
//
// UVM sets ATS_NOT_ALLOWED for all Hopper+ mappings on ATS systems.
// This is fine because CUDA ensures that all managed and external
// allocations are properly compartmentalized in 512MB-aligned VA
// regions. For cudaHostRegister CUDA cannot control the VA range, but
// we rely on ATS for those allocations so they can't choose the
// ATS_NOT_ALLOWED mode.
// TODO: Bug 3254055: Relax the NO_ATS setting from 512MB (pde1) range to
// PTEs.
// HW complies with the leaf PDE's ATS_ALLOWED/ATS_NOT_ALLOWED settings,
// enabling us to treat any upper-level PDE as a don't care as long as there
// are leaf PDEs for the entire upper-level PDE range. We assume PDE4
// entries (depth == 0) are always ATS enabled, and the no_ats_range is in
// PDE3 or lower.
if (depth == 0 || (!valid && is_ats_range_valid(dir, child_index)))
return pcf[pde_type][ATS_ALLOWED];
return pcf[pde_type][ATS_NOT_ALLOWED];
}
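
/*
 * pde_pcf() above replaces per-case HWCONST selections with small lookup
 * tables indexed by PDE type and ATS-allowed state. The standalone sketch
 * below shows the same table-selection pattern; the numeric values are
 * placeholders, not the hardware NV_MMU_VER3_* encodings.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { PDE_SINGLE, PDE_DUAL_BIG, PDE_DUAL_SMALL, PDE_TYPE_COUNT };
enum { ATS_OK = 0, ATS_BLOCKED = 1 };

static const uint8_t valid_pcf[PDE_TYPE_COUNT][2]   = { {0, 1}, {2, 3}, {4, 5} };
static const uint8_t invalid_pcf[PDE_TYPE_COUNT][2] = { {6, 7}, {8, 9}, {10, 11} };

static uint8_t pick_pcf(bool valid, int pde_type, bool ats_allowed)
{
    /* Pick the valid or invalid table, then index by type and ATS column. */
    const uint8_t (*table)[2] = valid ? valid_pcf : invalid_pcf;

    return table[pde_type][ats_allowed ? ATS_OK : ATS_BLOCKED];
}

int main(void)
{
    printf("%u\n", (unsigned)pick_pcf(true,  PDE_DUAL_BIG,   false)); /* 3 */
    printf("%u\n", (unsigned)pick_pcf(false, PDE_DUAL_SMALL, true));  /* 10 */
    return 0;
}
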
static NvU64 single_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, uvm_page_directory_t *dir, NvU32 child_index)
{
NvU64 pde_bits = 0;
@@ -280,38 +386,17 @@ static NvU64 single_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, NvU32 dep
break;
}
// PCF (permission control flags) 5:3
// Hopper GPUs on ATS-enabled systems perform a parallel lookup on both
// ATS and GMMU page tables. For managed memory we need to prevent this
// parallel lookup since we would not get any GPU fault if the CPU has
// a valid mapping. Also, for external ranges that are known to be
// mapped entirely on the GMMU page table we can skip the ATS lookup
// for performance reasons. Parallel ATS lookup is disabled in PDE1
// (depth 3) and, therefore, it applies to the underlying 512MB VA
// range.
//
// UVM sets ATS_NOT_ALLOWED for all Hopper+ mappings on ATS systems.
// This is fine because CUDA ensures that all managed and external
// allocations are properly compartmentalized in 512MB-aligned VA
// regions. For cudaHostRegister CUDA cannot control the VA range, but
// we rely on ATS for those allocations so they can't choose the
// ATS_NOT_ALLOWED mode.
//
// TODO: Bug 3254055: Relax the NO_ATS setting from 512MB (pde1) range
// to PTEs.
if (depth == 3 && g_uvm_global.ats.enabled)
pde_bits |= HWCONST64(_MMU_VER3, PDE, PCF, VALID_UNCACHED_ATS_NOT_ALLOWED);
else
pde_bits |= HWCONST64(_MMU_VER3, PDE, PCF, VALID_UNCACHED_ATS_ALLOWED);
// address 51:12
pde_bits |= HWVALUE64(_MMU_VER3, PDE, ADDRESS, address);
}
// PCF (permission control flags) 5:3
pde_bits |= HWVALUE64(_MMU_VER3, PDE, PCF, pde_pcf(phys_alloc != NULL, PDE_TYPE_SINGLE, dir, child_index));
return pde_bits;
}
static NvU64 big_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
static NvU64 big_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, uvm_page_directory_t *dir, NvU32 child_index)
{
NvU64 pde_bits = 0;
@@ -330,17 +415,20 @@ static NvU64 big_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
break;
}
// PCF (permission control flags) 5:3
pde_bits |= HWCONST64(_MMU_VER3, DUAL_PDE, PCF_BIG, VALID_UNCACHED_ATS_NOT_ALLOWED);
// address 51:8
pde_bits |= HWVALUE64(_MMU_VER3, DUAL_PDE, ADDRESS_BIG, address);
}
// PCF (permission control flags) 5:3
pde_bits |= HWVALUE64(_MMU_VER3,
DUAL_PDE,
PCF_BIG,
pde_pcf(phys_alloc != NULL, PDE_TYPE_DUAL_BIG, dir, child_index));
return pde_bits;
}
static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, uvm_page_directory_t *dir, NvU32 child_index)
{
NvU64 pde_bits = 0;
@@ -359,29 +447,40 @@ static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
break;
}
// PCF (permission control flags) 69:67 [5:3]
pde_bits |= HWCONST64(_MMU_VER3, DUAL_PDE, PCF_SMALL, VALID_UNCACHED_ATS_NOT_ALLOWED);
// address 115:76 [51:12]
pde_bits |= HWVALUE64(_MMU_VER3, DUAL_PDE, ADDRESS_SMALL, address);
}
// PCF (permission control flags) 69:67 [5:3]
pde_bits |= HWVALUE64(_MMU_VER3,
DUAL_PDE,
PCF_SMALL,
pde_pcf(phys_alloc != NULL, PDE_TYPE_DUAL_SMALL, dir, child_index));
return pde_bits;
}
static void make_pde_hopper(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
static void make_pde_hopper(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
uvm_page_directory_t *dir,
NvU32 child_index)
{
NvU32 entry_count = entries_per_index_hopper(depth);
NvU32 entry_count;
NvU64 *entry_bits = (NvU64 *)entry;
UVM_ASSERT(dir);
entry_count = entries_per_index_hopper(dir->depth);
if (entry_count == 1) {
*entry_bits = single_pde_hopper(*phys_allocs, depth);
*entry_bits = single_pde_hopper(*phys_allocs, dir, child_index);
}
else if (entry_count == 2) {
entry_bits[MMU_BIG] = big_half_pde_hopper(phys_allocs[MMU_BIG]);
entry_bits[MMU_SMALL] = small_half_pde_hopper(phys_allocs[MMU_SMALL]);
entry_bits[MMU_BIG] = big_half_pde_hopper(phys_allocs[MMU_BIG], dir, child_index);
entry_bits[MMU_SMALL] = small_half_pde_hopper(phys_allocs[MMU_SMALL], dir, child_index);
// This entry applies to the whole dual PDE but is stored in the lower
// bits
// bits.
entry_bits[MMU_BIG] |= HWCONST64(_MMU_VER3, DUAL_PDE, IS_PTE, FALSE);
}
else {

View File

@@ -128,8 +128,9 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
// present if we see the callback.
//
// The callback was added in commit 0f0a327fa12cd55de5e7f8c05a70ac3d047f405e,
// v3.19 (2014-11-13).
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE)
// v3.19 (2014-11-13) and renamed in commit 1af5a8109904.
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE) || \
defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
#define UVM_CAN_USE_MMU_NOTIFIERS() 1
#else
#define UVM_CAN_USE_MMU_NOTIFIERS() 0
@@ -153,10 +154,6 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
#define VM_MIXEDMAP 0x00000000
#endif
#if !defined(MPOL_PREFERRED_MANY)
#define MPOL_PREFERRED_MANY 5
#endif
//
// printk.h already defined pr_fmt, so we have to redefine it so the pr_*
// routines pick up our version

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2021 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -71,4 +71,6 @@ void uvm_hal_maxwell_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->smc.supported = false;
parent_gpu->plc_supported = false;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2021 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -106,10 +106,16 @@ static NvU64 small_half_pde_maxwell(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_maxwell(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
static void make_pde_maxwell(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
uvm_page_directory_t *dir,
NvU32 child_index)
{
NvU64 pde_bits = 0;
UVM_ASSERT(depth == 0);
UVM_ASSERT(dir);
UVM_ASSERT(dir->depth == 0);
pde_bits |= HWCONST64(_MMU, PDE, SIZE, FULL);
pde_bits |= big_half_pde_maxwell(phys_allocs[MMU_BIG]) | small_half_pde_maxwell(phys_allocs[MMU_SMALL]);

View File

@@ -672,14 +672,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
.finalize_and_map = uvm_migrate_vma_finalize_and_map_helper,
};
// WAR for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
//
// This code path isn't used on GH180 but we need to maintain consistent
// behaviour on systems that do.
if (!vma_is_anonymous(args->vma))
return NV_WARN_NOTHING_TO_DO;
ret = migrate_vma(&uvm_migrate_vma_ops, args->vma, args->start, args->end, args->src, args->dst, state);
if (ret < 0)
return errno_to_nv_status(ret);
@@ -693,24 +685,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
if (ret < 0)
return errno_to_nv_status(ret);
// TODO: Bug 2419180: support file-backed pages in migrate_vma, when
// support for it is added to the Linux kernel
//
// A side-effect of migrate_vma_setup() is it calls mmu notifiers even if a
// page can't be migrated (eg. because it's a non-anonymous mapping). We
// need this side-effect for SMMU on GH180 to ensure any cached read-only
// entries are flushed from SMMU on permission upgrade.
//
// TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
//
// The above WAR doesn't work for HugeTLBfs mappings because
// migrate_vma_setup() will fail in that case.
if (!vma_is_anonymous(args->vma)) {
migrate_vma_finalize(args);
return NV_WARN_NOTHING_TO_DO;
}
uvm_migrate_vma_alloc_and_copy(args, state);
if (state->status == NV_OK) {
migrate_vma_pages(args);
@@ -862,6 +836,17 @@ static NV_STATUS migrate_pageable_vma_region(struct vm_area_struct *vma,
return NV_OK;
}
NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
uvm_va_space_down_write(va_space);
va_space->test.skip_migrate_vma = params->skip;
uvm_va_space_up_write(va_space);
return NV_OK;
}
static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
unsigned long start,
unsigned long outer,
@@ -884,13 +869,12 @@ static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
start = max(start, vma->vm_start);
outer = min(outer, vma->vm_end);
// migrate_vma only supports anonymous VMAs. We check for those after
// calling migrate_vma_setup() to workaround Bug 4130089. We need to check
// for HugeTLB VMAs here because migrate_vma_setup() will return a fatal
// error for those.
// TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
if (is_vm_hugetlb_page(vma))
if (va_space->test.skip_migrate_vma)
return NV_WARN_NOTHING_TO_DO;
// TODO: Bug 2419180: support file-backed pages in migrate_vma, when
// support for it is added to the Linux kernel
if (!vma_is_anonymous(vma))
return NV_WARN_NOTHING_TO_DO;
if (uvm_processor_mask_empty(&va_space->registered_gpus))
@@ -950,7 +934,9 @@ static NV_STATUS migrate_pageable(migrate_vma_state_t *state)
bool touch = uvm_migrate_args->touch;
uvm_populate_permissions_t populate_permissions = uvm_migrate_args->populate_permissions;
UVM_ASSERT(!vma_is_anonymous(vma) || uvm_processor_mask_empty(&va_space->registered_gpus));
UVM_ASSERT(va_space->test.skip_migrate_vma ||
!vma_is_anonymous(vma) ||
uvm_processor_mask_empty(&va_space->registered_gpus));
// We can't use migrate_vma to move the pages as desired. Normally
// this fallback path is supposed to populate the memory then inform

View File

@@ -51,7 +51,7 @@ typedef struct
#if defined(CONFIG_MIGRATE_VMA_HELPER)
#define UVM_MIGRATE_VMA_SUPPORTED 1
#else
#if defined(CONFIG_DEVICE_PRIVATE) && defined(NV_MIGRATE_VMA_SETUP_PRESENT)
#if NV_IS_EXPORT_SYMBOL_PRESENT_migrate_vma_setup
#define UVM_MIGRATE_VMA_SUPPORTED 1
#endif
#endif
@@ -218,6 +218,9 @@ NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args);
NV_STATUS uvm_migrate_pageable_init(void);
void uvm_migrate_pageable_exit(void);
NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp);
#else // UVM_MIGRATE_VMA_SUPPORTED
static NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
@@ -251,6 +254,10 @@ static void uvm_migrate_pageable_exit(void)
{
}
static inline NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp)
{
return NV_OK;
}
#endif // UVM_MIGRATE_VMA_SUPPORTED
#endif

View File

@@ -323,37 +323,156 @@ static void uvm_mmu_page_table_cpu_memset_16(uvm_gpu_t *gpu,
uvm_mmu_page_table_cpu_unmap(gpu, phys_alloc);
}
static void pde_fill_cpu(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr)
{
NvU64 pde_data[2], entry_size;
NvU32 i;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(directory->depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
for (i = 0; i < pde_count; i++) {
tree->hal->make_pde(pde_data, phys_addr, directory, start_index + i);
if (entry_size == sizeof(pde_data[0]))
uvm_mmu_page_table_cpu_memset_8(tree->gpu, &directory->phys_alloc, start_index + i, pde_data[0], 1);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, &directory->phys_alloc, start_index + i, pde_data, 1);
}
}
static void pde_fill_gpu(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->phys_alloc.addr);
NvU32 max_inline_entries;
uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
uvm_gpu_address_t inline_data_addr;
uvm_push_inline_data_t inline_data;
NvU32 entry_count, i, j;
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(directory->depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / entry_size;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
pde_entry_addr.address += start_index * entry_size;
for (i = 0; i < pde_count;) {
// All but the first memory operation can be pipelined. We respect the
// caller's pipelining settings for the first push.
if (i != 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
entry_count = min(pde_count - i, max_inline_entries);
// No membar is needed until the last memory operation. Otherwise,
// use caller's membar flag.
if ((i + entry_count) < pde_count)
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
uvm_push_set_flag(push, push_membar_flag);
uvm_push_inline_data_begin(push, &inline_data);
for (j = 0; j < entry_count; j++) {
tree->hal->make_pde(pde_data, phys_addr, directory, start_index + i + j);
uvm_push_inline_data_add(&inline_data, pde_data, entry_size);
}
inline_data_addr = uvm_push_inline_data_end(&inline_data);
tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * entry_size);
i += entry_count;
pde_entry_addr.address += entry_size * entry_count;
}
}
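/*
 * Worked example for the chunking above (sizes are illustrative assumptions):
 * with a 4KB inline-data limit and 16-byte dual-PDE entries,
 * max_inline_entries == 4096 / 16 == 256, so writing 600 PDEs issues three
 * inline-data memcopies of 256, 256 and 88 entries; only the last one carries
 * the caller's membar flag, the earlier ones use MEMBAR_NONE.
 */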
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
// pde_fill() is optimized for pde_count == 1, which is the common case.
static void pde_fill(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, directory->depth, UVM_PAGE_SIZE_AGNOSTIC));
if (push)
pde_fill_gpu(tree, directory, start_index, pde_count, phys_addr, push);
else
pde_fill_cpu(tree, directory, start_index, pde_count, phys_addr);
}
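/*
 * Usage sketch (hypothetical call site): on a tree using CPU writes, a single
 * PDE entry can be rewritten by passing push == NULL, which routes through
 * pde_fill_cpu(); pde_write() below issues the same single-entry call.
 *
 *   uvm_mmu_page_table_alloc_t *allocs[2] = {NULL, NULL};
 *   pde_fill(tree, dir, entry_index, 1, allocs, NULL);
 */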
static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
{
NvU64 clear_bits[2];
uvm_mmu_mode_hal_t *hal = tree->hal;
NvU32 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);
NvU8 max_pde_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC) - 1;
if (dir->depth == tree->hal->page_table_depth(page_size)) {
*clear_bits = 0; // Invalid PTE
}
else {
// passing in NULL for the phys_allocs will mark the child entries as invalid
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
hal->make_pde(clear_bits, phys_allocs, dir->depth);
// Passing in NULL for the phys_allocs will mark the child entries as
// invalid.
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
// Make sure that using only clear_bits[0] will work
UVM_ASSERT(hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
}
// Init with an invalid PTE or clean PDE. Only Maxwell PDEs can have more
// than 512 entries. In this case, we initialize them all with the same
// clean PDE. ATS systems may require clean PDEs with
// ATS_ALLOWED/ATS_NOT_ALLOWED bit settings based on the mapping VA.
// We only set clear_bits to 0 at the lowest page table level (the PTE
// table), i.e., when depth is greater than max_pde_depth.
if ((dir->depth > max_pde_depth) || (entries_count > 512 && !g_uvm_global.ats.enabled)) {
NvU64 clear_bits[2];
// initialize the memory to a reasonable value
if (push) {
tree->gpu->parent->ce_hal->memset_8(push,
uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
// If it is not a PTE, make a clean PDE.
if (dir->depth != tree->hal->page_table_depth(page_size)) {
// The child index passed to make_pde() is zero/ignored here: it only
// matters on ATS-enabled systems, where directories are initialized
// through pde_fill() instead.
tree->hal->make_pde(clear_bits, phys_allocs, dir, 0);
// Make sure that using only clear_bits[0] will work.
UVM_ASSERT(tree->hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
}
else {
*clear_bits = 0;
}
// Initialize the memory to a reasonable value.
if (push) {
tree->gpu->parent->ce_hal->memset_8(push,
uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
*clear_bits,
dir->phys_alloc.size);
}
else {
uvm_mmu_page_table_cpu_memset_8(tree->gpu,
&dir->phys_alloc,
0,
*clear_bits,
dir->phys_alloc.size);
dir->phys_alloc.size / sizeof(*clear_bits));
}
}
else {
uvm_mmu_page_table_cpu_memset_8(tree->gpu,
&dir->phys_alloc,
0,
*clear_bits,
dir->phys_alloc.size / sizeof(*clear_bits));
pde_fill(tree, dir, 0, entries_count, phys_allocs, push);
}
}
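/*
 * Summary of the initialization paths chosen above, for reference:
 *
 *   PTE table (depth > max_pde_depth)    -> memset with the invalid PTE (0)
 *   >512-entry directory, ATS disabled   -> memset with one clean PDE
 *   any other directory                  -> pde_fill() with clean PDEs, which
 *                                           can apply a per-entry ATS PCF
 */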
static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
@@ -367,8 +486,10 @@ static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
NvLength phys_alloc_size = hal->allocation_size(depth, page_size);
uvm_page_directory_t *dir;
// The page tree doesn't cache PTEs so space is not allocated for entries that are always PTEs.
// 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not page_size.
// The page tree doesn't cache PTEs so space is not allocated for entries
// that are always PTEs.
// 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not
// page_size.
if (depth == hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC))
entry_count = 0;
else
@@ -409,108 +530,6 @@ static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, N
return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
}
static void pde_fill_cpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr)
{
NvU64 pde_data[2], entry_size;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
if (entry_size == sizeof(pde_data[0]))
uvm_mmu_page_table_cpu_memset_8(tree->gpu, directory, start_index, pde_data[0], pde_count);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, directory, start_index, pde_data, pde_count);
}
static void pde_fill_gpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->addr);
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
pde_entry_addr.address += start_index * entry_size;
if (entry_size == sizeof(pde_data[0])) {
tree->gpu->parent->ce_hal->memset_8(push, pde_entry_addr, pde_data[0], sizeof(pde_data[0]) * pde_count);
}
else {
NvU32 max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / sizeof(pde_data);
uvm_gpu_address_t inline_data_addr;
uvm_push_inline_data_t inline_data;
NvU32 membar_flag = 0;
NvU32 i;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
for (i = 0; i < pde_count;) {
NvU32 j;
NvU32 entry_count = min(pde_count - i, max_inline_entries);
uvm_push_inline_data_begin(push, &inline_data);
for (j = 0; j < entry_count; j++)
uvm_push_inline_data_add(&inline_data, pde_data, sizeof(pde_data));
inline_data_addr = uvm_push_inline_data_end(&inline_data);
// All but the first memcopy can be pipelined. We respect the
// caller's pipelining settings for the first push.
if (i != 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
// No membar is needed until the last copy. Otherwise, use
// caller's membar flag.
if (i + entry_count < pde_count)
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
else if (membar_flag)
uvm_push_set_flag(push, membar_flag);
tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * sizeof(pde_data));
i += entry_count;
pde_entry_addr.address += sizeof(pde_data) * entry_count;
}
}
}
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
static void pde_fill(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
if (push)
pde_fill_gpu(tree, depth, directory, start_index, pde_count, phys_addr, push);
else
pde_fill_cpu(tree, depth, directory, start_index, pde_count, phys_addr);
}
static uvm_page_directory_t *host_pde_write(uvm_page_directory_t *dir,
uvm_page_directory_t *parent,
NvU32 index_in_parent)
@@ -540,7 +559,7 @@ static void pde_write(uvm_page_tree_t *tree,
phys_allocs[i] = &entry->phys_alloc;
}
pde_fill(tree, dir->depth, &dir->phys_alloc, entry_index, 1, phys_allocs, push);
pde_fill(tree, dir, entry_index, 1, phys_allocs, push);
}
static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size)
@@ -800,7 +819,6 @@ static void free_unused_directories(uvm_page_tree_t *tree,
}
}
}
}
static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm_mmu_page_table_alloc_t *out)
@@ -811,10 +829,93 @@ static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm
return phys_mem_allocate(tree, alloc_size, tree->location, UVM_PMM_ALLOC_FLAGS_EVICT, out);
}
static bool page_tree_ats_init_required(uvm_page_tree_t *tree)
{
// We have full control of the kernel page table mappings, so no ATS address
// aliases are expected.
if (tree->type == UVM_PAGE_TREE_TYPE_KERNEL)
return false;
// Allow uvm_page_tree_init() to be called from the page_tree test.
if (uvm_enable_builtin_tests && tree->gpu_va_space == NULL)
return false;
if (!tree->gpu_va_space->ats.enabled)
return false;
return tree->gpu->parent->no_ats_range_required;
}
static NV_STATUS page_tree_ats_init(uvm_page_tree_t *tree)
{
NV_STATUS status;
NvU64 min_va_upper, max_va_lower;
NvU32 page_size;
if (!page_tree_ats_init_required(tree))
return NV_OK;
page_size = uvm_mmu_biggest_page_size(tree);
uvm_cpu_get_unaddressable_range(&max_va_lower, &min_va_upper);
// Potential violation of the UVM internal get/put_ptes contract. get_ptes()
// creates and initializes enough PTEs to populate all PDEs covering the
// no_ats_ranges. We store the no_ats_ranges in the tree, so they can be
// put_ptes()'ed on deinit(). This doesn't preclude the range from being used by a
// future get_ptes(), since we don't write to the PTEs (range->table) from
// the tree->no_ats_ranges.
//
// Lower half
status = uvm_page_tree_get_ptes(tree,
page_size,
max_va_lower,
page_size,
UVM_PMM_ALLOC_FLAGS_EVICT,
&tree->no_ats_ranges[0]);
if (status != NV_OK)
return status;
UVM_ASSERT(tree->no_ats_ranges[0].entry_count == 1);
if (uvm_platform_uses_canonical_form_address()) {
// Upper half
status = uvm_page_tree_get_ptes(tree,
page_size,
min_va_upper - page_size,
page_size,
UVM_PMM_ALLOC_FLAGS_EVICT,
&tree->no_ats_ranges[1]);
if (status != NV_OK)
return status;
UVM_ASSERT(tree->no_ats_ranges[1].entry_count == 1);
}
return NV_OK;
}
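/*
 * Worked example (illustrative, assuming a 48-bit canonical CPU VA): the CPU
 * hole reported by uvm_cpu_get_unaddressable_range() is
 * [0x0000800000000000, 0xffff800000000000), so the code above pins one leaf
 * table at 0x0000800000000000 and, since such systems use the canonical form,
 * a second one covering the page just below 0xffff800000000000.
 */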
static void page_tree_ats_deinit(uvm_page_tree_t *tree)
{
size_t i;
if (page_tree_ats_init_required(tree)) {
for (i = 0; i < ARRAY_SIZE(tree->no_ats_ranges); i++) {
if (tree->no_ats_ranges[i].entry_count)
uvm_page_tree_put_ptes(tree, &tree->no_ats_ranges[i]);
}
memset(tree->no_ats_ranges, 0, sizeof(tree->no_ats_ranges));
}
}
static void map_remap_deinit(uvm_page_tree_t *tree)
{
if (tree->map_remap.pde0.size)
phys_mem_deallocate(tree, &tree->map_remap.pde0);
if (tree->map_remap.pde0) {
phys_mem_deallocate(tree, &tree->map_remap.pde0->phys_alloc);
uvm_kvfree(tree->map_remap.pde0);
tree->map_remap.pde0 = NULL;
}
if (tree->map_remap.ptes_invalid_4k.size)
phys_mem_deallocate(tree, &tree->map_remap.ptes_invalid_4k);
@@ -839,10 +940,16 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
// PDE1-depth(512M) PTE. We first map it to the pde0 directory, then we
// return the PTE for the get_ptes()'s caller.
if (tree->hal->page_sizes() & UVM_PAGE_SIZE_512M) {
status = allocate_page_table(tree, UVM_PAGE_SIZE_2M, &tree->map_remap.pde0);
if (status != NV_OK)
tree->map_remap.pde0 = allocate_directory(tree,
UVM_PAGE_SIZE_2M,
tree->hal->page_table_depth(UVM_PAGE_SIZE_2M),
UVM_PMM_ALLOC_FLAGS_EVICT);
if (tree->map_remap.pde0 == NULL) {
status = NV_ERR_NO_MEMORY;
goto error;
}
}
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "map remap init");
if (status != NV_OK)
goto error;
@@ -864,22 +971,23 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
NvU32 depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_4K) - 1;
size_t index_4k = tree->hal->entry_offset(depth, UVM_PAGE_SIZE_4K);
// pde0 depth equals UVM_PAGE_SIZE_2M.
NvU32 pde0_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_2M);
NvU32 pde0_entries = tree->map_remap.pde0.size / tree->hal->entry_size(pde0_depth);
NvU32 pde0_entries = tree->map_remap.pde0->phys_alloc.size / tree->hal->entry_size(tree->map_remap.pde0->depth);
// The big-page entry is NULL which makes it an invalid entry.
phys_allocs[index_4k] = &tree->map_remap.ptes_invalid_4k;
// By default CE operations include a MEMBAR_SYS. MEMBAR_GPU is
// sufficient when pde0 is allocated in VIDMEM.
if (tree->map_remap.pde0.addr.aperture == UVM_APERTURE_VID)
if (tree->map_remap.pde0->phys_alloc.addr.aperture == UVM_APERTURE_VID)
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
// This is an orphan directory; make_pde() requires a directory to
// compute the VA. The UVM depth map_remap() operates on is not in the
// range make_pde() must operate on. We only need to supply the fields
// used by make_pde() so it does not access invalid memory addresses.
pde_fill(tree,
pde0_depth,
&tree->map_remap.pde0,
tree->map_remap.pde0,
0,
pde0_entries,
(uvm_mmu_page_table_alloc_t **)&phys_allocs,
@@ -1006,11 +1114,22 @@ NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
return status;
phys_mem_init(tree, UVM_PAGE_SIZE_AGNOSTIC, tree->root, &push);
return page_tree_end_and_wait(tree, &push);
status = page_tree_end_and_wait(tree, &push);
if (status != NV_OK)
return status;
status = page_tree_ats_init(tree);
if (status != NV_OK)
return status;
return NV_OK;
}
void uvm_page_tree_deinit(uvm_page_tree_t *tree)
{
page_tree_ats_deinit(tree);
UVM_ASSERT(tree->root->ref_count == 0);
// Take the tree lock only to avoid assertions. It is not required for
@@ -1249,7 +1368,6 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
UVM_ASSERT(uvm_gpu_can_address_kernel(tree->gpu, start, size));
while (true) {
// index of the entry, for the first byte of the range, within its
// containing directory
NvU32 start_index;
@@ -1281,7 +1399,8 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
if (dir_cache[dir->depth] == NULL) {
*cur_depth = dir->depth;
// Undo the changes to the tree so that the dir cache remains private to the thread
// Undo the changes to the tree so that the dir cache
// remains private to the thread.
for (i = 0; i < used_count; i++)
host_pde_clear(tree, dirs_used[i]->host_parent, dirs_used[i]->index_in_parent, page_size);
@@ -1332,10 +1451,9 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
if (uvm_page_table_range_aperture(range) == UVM_APERTURE_VID)
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
phys_alloc[0] = &tree->map_remap.pde0;
phys_alloc[0] = &tree->map_remap.pde0->phys_alloc;
pde_fill(tree,
range->table->depth,
&range->table->phys_alloc,
range->table,
range->start_index,
range->entry_count,
(uvm_mmu_page_table_alloc_t **)&phys_alloc,
@@ -1380,7 +1498,8 @@ NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
dir_cache)) == NV_ERR_MORE_PROCESSING_REQUIRED) {
uvm_mutex_unlock(&tree->lock);
// try_get_ptes never needs depth 0, so store a directory at its parent's depth
// try_get_ptes never needs depth 0, so store a directory at its
// parent's depth.
// TODO: Bug 1766655: Allocate everything below cur_depth instead of
// retrying for every level.
dir_cache[cur_depth] = allocate_directory(tree, page_size, cur_depth + 1, pmm_flags);
@@ -1663,8 +1782,12 @@ NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
range);
if (status != NV_OK) {
UVM_ERR_PRINT("Failed to get PTEs for subrange %zd [0x%llx, 0x%llx) size 0x%llx, part of [0x%llx, 0x%llx)\n",
i, range_start, range_start + range_size, range_size,
start, size);
i,
range_start,
range_start + range_size,
range_size,
start,
size);
goto out;
}
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -215,11 +215,14 @@ struct uvm_mmu_mode_hal_struct
// memory out-of-range error so we can immediately identify bad PTE usage.
NvU64 (*poisoned_pte)(void);
// write a PDE bit-pattern to entry based on the data in entries (which may
// Write a PDE bit-pattern to entry based on the data in allocs (which may
// point to two items for dual PDEs).
// any of allocs are allowed to be NULL, in which case they are to be
// treated as empty.
void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, NvU32 depth);
// Any of allocs are allowed to be NULL, in which case they are to be
// treated as empty. make_pde() uses dir and child_index to compute the
// mapping PDE VA. On ATS-enabled systems, we may set PDE's PCF as
// ATS_ALLOWED or ATS_NOT_ALLOWED based on the mapping PDE VA, even for
// invalid/clean PDE entries.
void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, uvm_page_directory_t *dir, NvU32 child_index);
// size of an entry in a directory/table. Generally either 8 or 16 bytes.
// (in the case of Pascal dual PDEs)
@@ -229,7 +232,7 @@ struct uvm_mmu_mode_hal_struct
NvU32 (*entries_per_index)(NvU32 depth);
// For dual PDEs, this is either 1 or 0, depending on the page size.
// This is used to index the host copy only. GPU PDEs are always entirely
// This is used to index the host copy only. GPU PDEs are always entirely
// re-written using make_pde.
NvLength (*entry_offset)(NvU32 depth, NvU32 page_size);
@@ -295,11 +298,16 @@ struct uvm_page_tree_struct
// PDE0 where all big-page entries are invalid, and small-page entries
// point to ptes_invalid_4k.
// pde0 is only used on Pascal-Ampere, i.e., they have the same PDE
// format.
uvm_mmu_page_table_alloc_t pde0;
// pde0 is used on Pascal+ GPUs, i.e., they have the same PDE format.
uvm_page_directory_t *pde0;
} map_remap;
// On ATS-enabled systems where the CPU VA width is smaller than the GPU VA
// width, the excess address range is set with ATS_NOT_ALLOWED on all leaf
// PDEs covering that range. We have at most 2 no_ats_ranges, due to
// canonical form address systems.
uvm_page_table_range_t no_ats_ranges[2];
// Tracker for all GPU operations on the tree
uvm_tracker_t tracker;
};
@@ -365,21 +373,32 @@ void uvm_page_tree_deinit(uvm_page_tree_t *tree);
// the same page size without an intervening put_ptes. To duplicate a subset of
// an existing range or change the size of an existing range, use
// uvm_page_table_range_get_upper() and/or uvm_page_table_range_shrink().
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range);
// Same as uvm_page_tree_get_ptes(), but doesn't synchronize the GPU work.
//
// All pending operations can be waited on with uvm_page_tree_wait().
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range);
// Returns a single-entry page table range for the addresses passed.
// The size parameter must be a page size supported by this tree.
// This is equivalent to calling uvm_page_tree_get_ptes() with size equal to
// page_size.
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *single);
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single);
// For a single-entry page table range, write the PDE (which could be a dual
// PDE) to the GPU.
@@ -478,8 +497,8 @@ NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
// new_range_vec will contain the upper portion of range_vec, starting at
// new_end + 1.
//
// new_end + 1 is required to be within the address range of range_vec and be aligned to
// range_vec's page_size.
// new_end + 1 is required to be within the address range of range_vec and be
// aligned to range_vec's page_size.
//
// On failure, the original range vector is left unmodified.
NV_STATUS uvm_page_table_range_vec_split_upper(uvm_page_table_range_vec_t *range_vec,
@@ -501,18 +520,22 @@ void uvm_page_table_range_vec_destroy(uvm_page_table_range_vec_t *range_vec);
// for each offset.
// The caller_data pointer is what the caller passed in as caller_data to
// uvm_page_table_range_vec_write_ptes().
typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec, NvU64 offset,
void *caller_data);
typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec,
NvU64 offset,
void *caller_data);
// Write all PTEs covered by the range vector using the given PTE making function.
// Write all PTEs covered by the range vector using the given PTE making
// function.
//
// After writing all the PTEs a TLB invalidate operation is performed including
// the passed in tlb_membar.
//
// See comments about uvm_page_table_range_pte_maker_t for details about the
// PTE making callback.
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker, void *caller_data);
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker,
void *caller_data);
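/*
 * Usage sketch (hypothetical callback, mirroring the page-tree test): each
 * offset gets a distinctive dummy value rather than a real MMU PTE encoding.
 *
 *   static NvU64 example_pte_maker(uvm_page_table_range_vec_t *range_vec,
 *                                  NvU64 offset,
 *                                  void *caller_data)
 *   {
 *       return range_vec->size + offset;
 *   }
 *
 *   status = uvm_page_table_range_vec_write_ptes(range_vec,
 *                                                UVM_MEMBAR_NONE,
 *                                                example_pte_maker,
 *                                                NULL);
 */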
// Set all PTEs covered by the range vector to an empty PTE
//
@@ -636,8 +659,9 @@ static NvU64 uvm_page_table_range_size(uvm_page_table_range_t *range)
// Get the physical address of the entry at entry_index within the range
// (counted from range->start_index).
static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree, uvm_page_table_range_t *range,
size_t entry_index)
static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree,
uvm_page_table_range_t *range,
size_t entry_index)
{
NvU32 entry_size = uvm_mmu_pte_size(tree, range->page_size);
uvm_gpu_phys_address_t entry = range->table->phys_alloc.addr;

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -146,9 +146,15 @@ static void fake_tlb_invals_disable(void)
g_fake_tlb_invals_tracking_enabled = false;
}
// Fake TLB invalidate VA that just saves off the parameters so that they can be verified later
static void fake_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_address_t pdb,
NvU32 depth, NvU64 base, NvU64 size, NvU32 page_size, uvm_membar_t membar)
// Fake TLB invalidate VA that just saves off the parameters so that they can be
// verified later.
static void fake_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
uvm_membar_t membar)
{
if (!g_fake_tlb_invals_tracking_enabled)
return;
@@ -210,8 +216,8 @@ static bool assert_and_reset_last_invalidate(NvU32 expected_depth, bool expected
}
if ((g_last_fake_inval->membar == UVM_MEMBAR_NONE) == expected_membar) {
UVM_TEST_PRINT("Expected %s membar, got %s instead\n",
expected_membar ? "a" : "no",
uvm_membar_string(g_last_fake_inval->membar));
expected_membar ? "a" : "no",
uvm_membar_string(g_last_fake_inval->membar));
result = false;
}
@@ -230,7 +236,8 @@ static bool assert_last_invalidate_all(NvU32 expected_depth, bool expected_memba
}
if (g_last_fake_inval->base != 0 || g_last_fake_inval->size != -1) {
UVM_TEST_PRINT("Expected invalidate all but got range [0x%llx, 0x%llx) instead\n",
g_last_fake_inval->base, g_last_fake_inval->base + g_last_fake_inval->size);
g_last_fake_inval->base,
g_last_fake_inval->base + g_last_fake_inval->size);
return false;
}
if (g_last_fake_inval->depth != expected_depth) {
@@ -247,15 +254,16 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);
if (g_fake_invals_count == 0) {
UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n",
base, base + size);
UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n", base, base + size);
return false;
}
if ((inval->base != base || inval->size != size) && inval->base != 0 && inval->size != -1) {
UVM_TEST_PRINT("Expected invalidate range [0x%llx, 0x%llx), but got range [0x%llx, 0x%llx) instead\n",
base, base + size,
inval->base, inval->base + inval->size);
base,
base + size,
inval->base,
inval->base + inval->size);
return false;
}
if (inval->depth != expected_depth) {
@@ -270,7 +278,13 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
return true;
}
static bool assert_invalidate_range(NvU64 base, NvU64 size, NvU32 page_size, bool allow_inval_all, NvU32 range_depth, NvU32 all_depth, bool expected_membar)
static bool assert_invalidate_range(NvU64 base,
NvU64 size,
NvU32 page_size,
bool allow_inval_all,
NvU32 range_depth,
NvU32 all_depth,
bool expected_membar)
{
NvU32 i;
@@ -488,7 +502,6 @@ static NV_STATUS alloc_adjacent_pde_64k_memory(uvm_gpu_t *gpu)
return NV_OK;
}
static NV_STATUS alloc_nearby_pde_64k_memory(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
@@ -842,6 +855,7 @@ static NV_STATUS get_two_free_apart(uvm_gpu_t *gpu)
TEST_CHECK_RET(range2.entry_count == 256);
TEST_CHECK_RET(range2.table->ref_count == 512);
TEST_CHECK_RET(range1.table == range2.table);
// 4k page is second entry in a dual PDE
TEST_CHECK_RET(range1.table == tree.root->entries[0]->entries[0]->entries[0]->entries[1]);
TEST_CHECK_RET(range1.start_index == 256);
@@ -871,6 +885,7 @@ static NV_STATUS get_overlapping_dual_pdes(uvm_gpu_t *gpu)
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, size, size, &range64k), NV_OK);
TEST_CHECK_RET(range64k.entry_count == 16);
TEST_CHECK_RET(range64k.table->ref_count == 16);
// 4k page is second entry in a dual PDE
TEST_CHECK_RET(range64k.table == tree.root->entries[0]->entries[0]->entries[0]->entries[0]);
TEST_CHECK_RET(range64k.start_index == 16);
@@ -1030,10 +1045,13 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
// Depth 4
NvU64 extent_pte = UVM_PAGE_SIZE_2M;
// Depth 3
NvU64 extent_pde0 = extent_pte * (1ull << 8);
// Depth 2
NvU64 extent_pde1 = extent_pde0 * (1ull << 9);
// Depth 1
NvU64 extent_pde2 = extent_pde1 * (1ull << 9);
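/*
 * Worked values for the extents above: extent_pte = 2MB,
 * extent_pde0 = 2MB * 256 = 512MB, extent_pde1 = 512MB * 512 = 256GB,
 * extent_pde2 = 256GB * 512 = 128TB.
 */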
@@ -1081,7 +1099,11 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
return status;
}
static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree, NvU64 base, NvU64 size, NvU32 min_page_size, NvU32 max_page_size)
static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
NvU64 base,
NvU64 size,
NvU32 min_page_size,
NvU32 max_page_size)
{
NV_STATUS status = NV_OK;
uvm_push_t push;
@@ -1205,7 +1227,11 @@ static bool assert_range_vec_ptes(uvm_page_table_range_vec_t *range_vec, bool ex
NvU64 expected_pte = expecting_cleared ? 0 : range_vec->size + offset;
if (*pte != expected_pte) {
UVM_TEST_PRINT("PTE is 0x%llx instead of 0x%llx for offset 0x%llx within range [0x%llx, 0x%llx)\n",
*pte, expected_pte, offset, range_vec->start, range_vec->size);
*pte,
expected_pte,
offset,
range_vec->start,
range_vec->size);
return false;
}
offset += range_vec->page_size;
@@ -1226,7 +1252,11 @@ static NV_STATUS test_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec
TEST_CHECK_RET(data.status == NV_OK);
TEST_CHECK_RET(data.count == range_vec->size / range_vec->page_size);
TEST_CHECK_RET(assert_invalidate_range_specific(g_last_fake_inval,
range_vec->start, range_vec->size, range_vec->page_size, page_table_depth, membar != UVM_MEMBAR_NONE));
range_vec->start,
range_vec->size,
range_vec->page_size,
page_table_depth,
membar != UVM_MEMBAR_NONE));
TEST_CHECK_RET(assert_range_vec_ptes(range_vec, false));
fake_tlb_invals_disable();
@@ -1249,7 +1279,11 @@ static NV_STATUS test_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec
return NV_OK;
}
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree, NvU64 start, NvU64 size, NvU32 page_size, uvm_page_table_range_vec_t **range_vec_out)
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
uvm_page_table_range_vec_t **range_vec_out)
{
uvm_page_table_range_vec_t *range_vec;
uvm_pmm_alloc_flags_t pmm_flags = UVM_PMM_ALLOC_FLAGS_EVICT;
@@ -1544,25 +1578,28 @@ static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
uvm_mmu_mode_hal_t *hal;
uvm_page_directory_t dir;
NvU32 i, j, big_page_size, page_size;
dir.depth = 0;
for (i = 0; i < ARRAY_SIZE(big_page_sizes); i++) {
big_page_size = big_page_sizes[i];
hal = gpu->parent->arch_hal->mmu_mode_hal(big_page_size);
memset(phys_allocs, 0, sizeof(phys_allocs));
hal->make_pde(&pde_bits, phys_allocs, 0);
hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits == 0x0L);
phys_allocs[0] = &alloc_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(&pde_bits, phys_allocs, 0);
hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits == 0x1BBBBBBD99999992LL);
phys_allocs[0] = &alloc_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(&pde_bits, phys_allocs, 0);
hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits == 0x9999999E1BBBBBB1LL);
for (j = 0; j <= 2; j++) {
@@ -1632,38 +1669,47 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
uvm_page_directory_t dir;
// big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBBB00LL);
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
dir.index_in_parent = 0;
dir.host_parent = NULL;
dir.depth = 0;
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 3);
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0);
dir.depth = 0;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 3);
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 3);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
// uncached, i.e., the sysmem data is not cached in GPU's L2 cache. Clear
@@ -1719,6 +1765,7 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
uvm_page_directory_t dir;
// big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
@@ -1726,37 +1773,45 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
dir.index_in_parent = 0;
dir.host_parent = NULL;
dir.depth = 0;
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 3);
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0);
dir.depth = 0;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 3);
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 3);
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
// NO_ATS PDE1 (depth 2)
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 2);
dir.depth = 2;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
if (g_uvm_global.ats.enabled)
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB2A);
else
@@ -1791,104 +1846,203 @@ static NV_STATUS entry_test_ampere(uvm_gpu_t *gpu, entry_test_page_size_func ent
static NV_STATUS entry_test_hopper(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NV_STATUS status = NV_OK;
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
uvm_page_directory_t *dirs[5];
size_t i, num_page_sizes;
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0xBBBBBBB000LL);
// big versions have [11:8] set as well to test the page table merging
// Big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999999900LL);
uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0xBBBBBBBB00LL);
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(dirs, 0, sizeof(dirs));
// Fake directory tree.
for (i = 0; i < ARRAY_SIZE(dirs); i++) {
dirs[i] = uvm_kvmalloc_zero(sizeof(uvm_page_directory_t) + sizeof(dirs[i]->entries[0]) * 512);
TEST_CHECK_GOTO(dirs[i] != NULL, cleanup);
dirs[i]->depth = i;
dirs[i]->index_in_parent = 0;
if (i == 0)
dirs[i]->host_parent = NULL;
else
dirs[i]->host_parent = dirs[i - 1];
}
// Make sure cleared PDEs work as expected.
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0, cleanup);
// Cleared PDEs work as expected for big and small PDEs.
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0 && pde_bits[1] == 0, cleanup);
// Sys and vidmem PDEs, uncached ATS allowed.
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x999999999900C);
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBB00A);
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBB00A, cleanup);
// Dual PDEs, uncached.
// Dual PDEs, uncached. We don't vary the child index in the depth 4 checks
// because our policy decides the PDE's PCF without using it.
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A);
hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
if (g_uvm_global.ats.enabled)
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A, cleanup);
else
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999990C && pde_bits[1] == 0xBBBBBBB00A, cleanup);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C);
hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
if (g_uvm_global.ats.enabled)
TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C, cleanup);
else
TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBBB0A && pde_bits[1] == 0x999999999900C, cleanup);
// We only need to test make_pde() on ATS when the CPU VA width < GPU's.
if (g_uvm_global.ats.enabled && uvm_cpu_num_va_bits() < hal->num_va_bits()) {
phys_allocs[0] = &alloc_sys;
dirs[1]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
dirs[2]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 1;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 1);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 2;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 2);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 511;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 511);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[1]->index_in_parent = 1;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 1);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
dirs[2]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 509;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 509);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 510;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 510);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
phys_allocs[0] = NULL;
dirs[1]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
dirs[2]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
dirs[2]->index_in_parent = 2;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 2);
TEST_CHECK_GOTO(pde_bits[0] == 0x10, cleanup);
dirs[1]->index_in_parent = 1;
dirs[2]->index_in_parent = 509;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 509);
TEST_CHECK_GOTO(pde_bits[0] == 0x10, cleanup);
dirs[2]->index_in_parent = 510;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 510);
TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
}
// uncached, i.e., the sysmem data is not cached in GPU's L2 cache, and
// access counters disabled.
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) == 0x999999999968D);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) == 0x999999999968D,
cleanup);
// change to cached.
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED | UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) ==
0x9999999999685);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED | UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) ==
0x9999999999685,
cleanup);
// enable access counters.
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999605);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999605,
cleanup);
// remove atomic
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999645);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999645,
cleanup);
// read only
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999665);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999665,
cleanup);
// local video
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_VID,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0xBBBBBBB661);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_VID,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0xBBBBBBB661,
cleanup);
// peer 1
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_PEER_1,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x200000BBBBBBB663);
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_PEER_1,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x200000BBBBBBB663,
cleanup);
// sparse
TEST_CHECK_RET(hal->make_sparse_pte() == 0x8);
TEST_CHECK_GOTO(hal->make_sparse_pte() == 0x8, cleanup);
// sked reflected
TEST_CHECK_RET(hal->make_sked_reflected_pte() == 0xF09);
TEST_CHECK_GOTO(hal->make_sked_reflected_pte() == 0xF09, cleanup);
num_page_sizes = get_page_sizes(gpu, page_sizes);
for (i = 0; i < num_page_sizes; i++)
TEST_NV_CHECK_RET(entry_test_page_size(gpu, page_sizes[i]));
TEST_NV_CHECK_GOTO(entry_test_page_size(gpu, page_sizes[i]), cleanup);
return NV_OK;
cleanup:
for (i = 0; i < ARRAY_SIZE(dirs); i++)
uvm_kvfree(dirs[i]);
return status;
}
static NV_STATUS alloc_4k_maxwell(uvm_gpu_t *gpu)
@@ -2303,7 +2457,8 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
gpu->parent = parent_gpu;
// At least test_tlb_invalidates() relies on global state
// (g_tlb_invalidate_*) so make sure only one test instance can run at a time.
// (g_tlb_invalidate_*) so make sure only one test instance can run at a
// time.
uvm_mutex_lock(&g_uvm_global.global_lock);
// Allocate the fake TLB tracking state. Notably tests still need to enable
@@ -2311,7 +2466,13 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
// calls.
TEST_NV_CHECK_GOTO(fake_tlb_invals_alloc(), done);
TEST_NV_CHECK_GOTO(maxwell_test_page_tree(gpu), done);
// We prevent the maxwell_test_page_tree test from running on ATS-enabled
// systems. On "fake" Maxwell-based ATS systems pde_fill() may push more
// methods than what we support in UVM, specifically during
// uvm_page_tree_init(), which eventually calls phys_mem_init(). On Maxwell,
// upper PDE levels have more than 512 entries.
if (!g_uvm_global.ats.enabled)
TEST_NV_CHECK_GOTO(maxwell_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(pascal_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(volta_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(ampere_test_page_tree(gpu), done);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2020 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -100,4 +100,6 @@ void uvm_hal_pascal_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->smc.supported = false;
parent_gpu->plc_supported = false;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2020 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -140,11 +140,18 @@ static NvU64 small_half_pde_pascal(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_pascal(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
static void make_pde_pascal(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
uvm_page_directory_t *dir,
NvU32 child_index)
{
NvU32 entry_count = entries_per_index_pascal(depth);
NvU32 entry_count;
NvU64 *entry_bits = (NvU64 *)entry;
UVM_ASSERT(dir);
entry_count = entries_per_index_pascal(dir->depth);
if (entry_count == 1) {
*entry_bits = single_pde_pascal(*phys_allocs);
}
@@ -152,7 +159,8 @@ static void make_pde_pascal(void *entry, uvm_mmu_page_table_alloc_t **phys_alloc
entry_bits[MMU_BIG] = big_half_pde_pascal(phys_allocs[MMU_BIG]);
entry_bits[MMU_SMALL] = small_half_pde_pascal(phys_allocs[MMU_SMALL]);
// This entry applies to the whole dual PDE but is stored in the lower bits
// This entry applies to the whole dual PDE but is stored in the lower
// bits.
entry_bits[MMU_BIG] |= HWCONST64(_MMU_VER2, DUAL_PDE, IS_PDE, TRUE);
}
else {

View File

@@ -36,6 +36,7 @@
#include "uvm_mmu.h"
#include "uvm_gpu_access_counters.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_migrate_pageable.h"
static NV_STATUS uvm_test_get_gpu_ref_count(UVM_TEST_GET_GPU_REF_COUNT_PARAMS *params, struct file *filp)
{
@@ -331,6 +332,7 @@ long uvm_test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
UVM_ROUTE_CMD_STACK_NO_INIT_CHECK(UVM_TEST_CGROUP_ACCOUNTING_SUPPORTED, uvm_test_cgroup_accounting_supported);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SPLIT_INVALIDATE_DELAY, uvm_test_split_invalidate_delay);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_CPU_CHUNK_API, uvm_test_cpu_chunk_api);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SKIP_MIGRATE_VMA, uvm_test_skip_migrate_vma);
}
return -EINVAL;

View File

@@ -28,6 +28,13 @@
#include "uvm_ioctl.h"
#include "nv_uvm_types.h"
#define UVM_TEST_SKIP_MIGRATE_VMA UVM_TEST_IOCTL_BASE(103)
typedef struct
{
NvBool skip; // In
NV_STATUS rmStatus; // Out
} UVM_TEST_SKIP_MIGRATE_VMA_PARAMS;
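For context, the new override is routed through uvm_test_ioctl() in the hunk above. A rough user-space sketch, assuming the UVM test ioctls are issued as plain ioctl(2) calls on an open UVM device file descriptor (uvm_fd and the error handling here are purely illustrative):
UVM_TEST_SKIP_MIGRATE_VMA_PARAMS params = { 0 };
params.skip = NV_TRUE;   /* ask UVM to bypass the migrate_vma path */
if (ioctl(uvm_fd, UVM_TEST_SKIP_MIGRATE_VMA, &params) != 0 || params.rmStatus != NV_OK)
{
    /* the test override could not be applied */
}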
#ifdef __cplusplus
extern "C" {
#endif

View File

@@ -1082,25 +1082,19 @@ void uvm_tools_broadcast_replay(uvm_gpu_t *gpu,
}
void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu,
NvU32 batch_id,
uvm_fault_client_type_t client_type)
void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu, NvU32 batch_id, uvm_fault_client_type_t client_type)
{
UVM_ASSERT(!gpu->parent->has_clear_faulted_channel_method);
if (!tools_is_event_enabled_in_any_va_space(UvmEventTypeGpuFaultReplay))
return;
record_replay_event_helper(gpu->id,
batch_id,
client_type,
NV_GETTIME(),
gpu->parent->host_hal->get_time(gpu));
record_replay_event_helper(gpu->id, batch_id, client_type, NV_GETTIME(), gpu->parent->host_hal->get_time(gpu));
}
void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *buffer_entry,
bool on_managed)
bool on_managed_phys)
{
UvmEventEntry entry;
UvmEventTestAccessCounterInfo *info = &entry.testEventData.accessCounter;
@@ -1119,6 +1113,7 @@ void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
info->srcIndex = uvm_id_value(gpu->id);
info->address = buffer_entry->address.address;
info->isVirtual = buffer_entry->address.is_virtual? 1: 0;
if (buffer_entry->address.is_virtual) {
info->instancePtr = buffer_entry->virtual_info.instance_ptr.address;
info->instancePtrAperture = g_hal_to_tools_aperture_table[buffer_entry->virtual_info.instance_ptr.aperture];
@@ -1126,9 +1121,10 @@ void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
}
else {
info->aperture = g_hal_to_tools_aperture_table[buffer_entry->address.aperture];
info->physOnManaged = on_managed_phys? 1 : 0;
}
info->isFromCpu = buffer_entry->counter_type == UVM_ACCESS_COUNTER_TYPE_MOMC? 1: 0;
info->onManaged = on_managed? 1 : 0;
info->value = buffer_entry->counter_value;
info->subGranularity = buffer_entry->sub_granularity;
info->bank = buffer_entry->bank;

View File

@@ -102,18 +102,13 @@ void uvm_tools_record_read_duplicate_invalidate(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
const uvm_page_mask_t *page_mask);
void uvm_tools_broadcast_replay(uvm_gpu_t *gpu,
uvm_push_t *push,
NvU32 batch_id,
uvm_fault_client_type_t client_type);
void uvm_tools_broadcast_replay(uvm_gpu_t *gpu, uvm_push_t *push, NvU32 batch_id, uvm_fault_client_type_t client_type);
void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu,
NvU32 batch_id,
uvm_fault_client_type_t client_type);
void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu, NvU32 batch_id, uvm_fault_client_type_t client_type);
void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *buffer_entry,
bool on_managed);
bool on_managed_phys);
void uvm_tools_test_hmm_split_invalidate(uvm_va_space_t *va_space);

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2021 NVIDIA Corporation
Copyright (c) 2017-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -93,4 +93,6 @@ void uvm_hal_turing_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->smc.supported = false;
parent_gpu->plc_supported = true;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -967,8 +967,10 @@ typedef struct
NvU8 isFromCpu;
NvU8 veId;
NvU8 onManaged; // The access counter notification was triggered on
// a managed memory region
// The physical access counter notification was triggered on a managed
// memory region. This is not set for virtual access counter notifications.
NvU8 physOnManaged;
NvU32 value;
NvU32 subGranularity;

View File

@@ -1760,6 +1760,21 @@ static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
return (NvU32)chunk_size;
}
NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
uvm_processor_id_t processor,
uvm_page_index_t page_index)
{
block_phys_page_t page;
UVM_ASSERT(block);
uvm_assert_mutex_locked(&block->lock);
page = block_phys_page(processor, page_index);
return block_phys_page_size(block, page);
}
static uvm_pte_bits_cpu_t get_cpu_pte_bit_index(uvm_prot_t prot)
{
uvm_pte_bits_cpu_t pte_bit_index = UVM_PTE_BITS_CPU_MAX;
@@ -8248,14 +8263,6 @@ void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
event_data.block_munmap.region = region;
uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_MUNMAP, &event_data);
// Set a flag so that GPU fault events are flushed since they might refer
// to the region being unmapped.
// Note that holding the va_block lock prevents GPU VA spaces from
// being removed so the registered_gpu_va_spaces mask is stable.
for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
}
// Release any remaining vidmem chunks in the given region.
for_each_gpu_id(gpu_id) {
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
@@ -10155,6 +10162,34 @@ static uvm_processor_id_t block_select_residency(uvm_va_block_t *va_block,
uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id))
return preferred_location;
// Check if we should map the closest resident processor remotely on a remote
// CPU fault.
//
// When the CPU faults, there is a Linux process on whose behalf the fault is
// handled, associated with a unique VM pointed to by current->mm. A block of
// memory resident on the GPU is also associated with a VM, pointed to by
// va_block_context->mm. If the two match, this is a regular (local) fault and
// we may want to migrate the page from the GPU to the CPU. If it is a
// 'remote' fault, i.e. the faulting Linux process differs from the one
// associated with the block's VM, we may preserve the residency instead.
//
// Establishing a remote mapping without access counters means the memory
// could stay in the wrong place for a long time, which is why we prefer to
// avoid creating remote mappings. However, when a NIC accesses memory
// resident on the GPU, it is worth keeping it in place for those NIC
// accesses.
//
// The logic used to detect remote faulting also keeps memory in place for
// ptrace accesses. We would prefer to control those two policies separately,
// but the NIC case takes priority.
//
// If the accessing processor is the CPU, we are either handling a fault from
// a process other than the owning one, or handling an MOMC notification.
// Only prevent migration for the former.
if (UVM_ID_IS_CPU(processor_id) &&
operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS &&
uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
va_block_context->mm != current->mm) {
UVM_ASSERT(va_block_context->mm != NULL);
return closest_resident_processor;
}
// If the page is resident on a processor other than the preferred location,
// or the faulting processor can't access the preferred location, we select
// the faulting processor as the new residency.
@@ -10713,7 +10748,7 @@ NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t processor_id,
uvm_page_index_t page_index,
uvm_fault_type_t access_type,
uvm_fault_access_type_t access_type,
bool allow_migration)
{
uvm_va_range_t *va_range = va_block->va_range;

View File

@@ -1000,7 +1000,7 @@ NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t processor_id,
uvm_page_index_t page_index,
uvm_fault_type_t access_type,
uvm_fault_access_type_t access_type,
bool allow_migration);
// API for access privilege revocation
@@ -2072,6 +2072,14 @@ void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
// Locking: The va_block lock must be held.
void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region);
// Get the size of the physical allocation backing the page at page_index on the
// specified processor in the block. Returns 0 if the address is not resident on
// the specified processor.
// Locking: The va_block lock must be held.
NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
uvm_processor_id_t processor,
uvm_page_index_t page_index);
// Get CPU page size or 0 if it is not mapped
NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
uvm_page_index_t page_index);

View File

@@ -193,7 +193,8 @@ uvm_va_policy_node_t *uvm_va_policy_node_iter_next(uvm_va_block_t *va_block, uvm
for ((node) = uvm_va_policy_node_iter_first((va_block), (start), (end)), \
(next) = uvm_va_policy_node_iter_next((va_block), (node), (end)); \
(node); \
(node) = (next))
(node) = (next), \
(next) = uvm_va_policy_node_iter_next((va_block), (node), (end)))
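The added increment expression re-fetches 'next' on every pass; the previous form computed it only once, so the iterator could not advance past the second node, and the body could not safely drop the node it was visiting. A minimal sketch of the same "fetch next before the body" idea on a toy list (everything below is invented for illustration, not a UVM API):
#include <stdio.h>
#include <stdlib.h>

struct node { int val; struct node *next; };

/* Re-fetch 'next' each iteration so the body may free 'node'. */
#define for_each_node_safe(node, next, head)                          \
    for ((node) = (head), (next) = (node) ? (node)->next : NULL;      \
         (node);                                                      \
         (node) = (next), (next) = (node) ? (node)->next : NULL)

int main(void)
{
    struct node *head = NULL, *n, *nn;
    int i;

    for (i = 0; i < 3; i++) {
        n = malloc(sizeof(*n));
        n->val = i;
        n->next = head;
        head = n;
    }

    /* Freeing the current node is safe: 'nn' was captured before the body. */
    for_each_node_safe(n, nn, head) {
        printf("%d\n", n->val);
        free(n);
    }
    return 0;
}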
// Returns the first policy in the range [start, end], if any.
// Locking: The va_block lock must be held.

View File

@@ -1540,7 +1540,6 @@ static void remove_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space,
atomic_inc(&va_space->gpu_va_space_deferred_free.num_pending);
uvm_processor_mask_clear(&va_space->registered_gpu_va_spaces, gpu_va_space->gpu->id);
uvm_processor_mask_clear_atomic(&va_space->needs_fault_buffer_flush, gpu_va_space->gpu->id);
va_space->gpu_va_spaces[uvm_id_gpu_index(gpu_va_space->gpu->id)] = NULL;
gpu_va_space->state = UVM_GPU_VA_SPACE_STATE_DEAD;
}

View File

@@ -253,17 +253,6 @@ struct uvm_va_space_struct
// corrupting state.
uvm_processor_mask_t gpu_unregister_in_progress;
// On VMA destruction, the fault buffer needs to be flushed for all the GPUs
// registered in the VA space to avoid leaving stale entries of the VA range
// that is going to be destroyed. Otherwise, these fault entries can be
// attributed to new VA ranges reallocated at the same addresses. However,
// uvm_vm_close is called with mm->mmap_lock taken and we cannot take the
// ISR lock. Therefore, we use a flag to notify the GPU fault handler that
// the fault buffer needs to be flushed, before servicing the faults that
// belong to the va_space. The bits are set and cleared atomically so no
// va_space lock is required.
uvm_processor_mask_t needs_fault_buffer_flush;
// Mask of processors that are participating in system-wide atomics
uvm_processor_mask_t system_wide_atomics_enabled_processors;
@@ -353,6 +342,7 @@ struct uvm_va_space_struct
struct
{
bool page_prefetch_enabled;
bool skip_migrate_vma;
atomic_t migrate_vma_allocation_fail_nth;

View File

@@ -215,7 +215,13 @@ bool uvm_va_space_mm_enabled(uvm_va_space_t *va_space)
static struct mmu_notifier_ops uvm_mmu_notifier_ops_ats =
{
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE)
.invalidate_range = uvm_mmu_notifier_invalidate_range_ats,
#elif defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
.arch_invalidate_secondary_tlbs = uvm_mmu_notifier_invalidate_range_ats,
#else
#error One of invalidate_range/arch_invalidate_secondary_tlbs must be present
#endif
};
static int uvm_mmu_notifier_register(uvm_va_space_mm_t *va_space_mm)

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2021 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -98,4 +98,6 @@ void uvm_hal_volta_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->smc.supported = false;
parent_gpu->plc_supported = false;
parent_gpu->no_ats_range_required = false;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2021 NVIDIA Corporation
Copyright (c) 2017-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -145,13 +145,20 @@ static NvU64 small_half_pde_volta(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_volta(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
static void make_pde_volta(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
uvm_page_directory_t *dir,
NvU32 child_index)
{
NvU32 entry_count = entries_per_index_volta(depth);
NvU32 entry_count;
NvU64 *entry_bits = (NvU64 *)entry;
UVM_ASSERT(dir);
entry_count = entries_per_index_volta(dir->depth);
if (entry_count == 1) {
*entry_bits = single_pde_volta(*phys_allocs, depth);
*entry_bits = single_pde_volta(*phys_allocs, dir->depth);
}
else if (entry_count == 2) {
entry_bits[MMU_BIG] = big_half_pde_volta(phys_allocs[MMU_BIG]);

View File

@@ -23,10 +23,16 @@
#include "internal_crypt_lib.h"
#ifdef USE_LKCA
#ifndef NV_CRYPTO_TFM_CTX_ALIGNED_PRESENT
#include <crypto/internal/hash.h>
#endif
#endif
void *lkca_hash_new(const char* alg_name)
{
#ifndef USE_LKCA
return false;
return NULL;
#else
//XXX: can we reuse crypto_shash part and just allocate desc
struct crypto_shash *alg;
@@ -87,9 +93,24 @@ bool lkca_hmac_duplicate(struct shash_desc *dst, struct shash_desc const *src)
struct crypto_shash *src_tfm = src->tfm;
struct crypto_shash *dst_tfm = dst->tfm;
int ss = crypto_shash_statesize(dst_tfm);
#ifdef NV_CRYPTO_TFM_CTX_ALIGNED_PRESENT
char *src_ipad = crypto_tfm_ctx_aligned(&src_tfm->base);
char *dst_ipad = crypto_tfm_ctx_aligned(&dst_tfm->base);
int ss = crypto_shash_statesize(dst_tfm);
#else
int ctx_size = crypto_shash_alg(dst_tfm)->base.cra_ctxsize;
char *src_ipad = crypto_shash_ctx(src_tfm);
char *dst_ipad = crypto_shash_ctx(dst_tfm);
/*
 * The actual struct definition is hidden, so we assume the data we need is
 * at the end. In 6.0 the struct has a pointer to crypto_shash followed by
 * 'u8 ipad[statesize];', then 'u8 opad[statesize];'.
 */
src_ipad += ctx_size - 2 * ss;
dst_ipad += ctx_size - 2 * ss;
#endif
memcpy(dst_ipad, src_ipad, crypto_shash_blocksize(src->tfm));
memcpy(dst_ipad + ss, src_ipad + ss, crypto_shash_blocksize(src->tfm));
crypto_shash_clear_flags(dst->tfm, CRYPTO_TFM_NEED_KEY);
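The fallback branch above leans entirely on the layout assumption spelled out in its comment. A small standalone check of that arithmetic against a hypothetical context layout (fake_hmac_ctx is invented for illustration; the real structure is private to the kernel crypto code):
#include <stddef.h>
#include <assert.h>

#define FAKE_STATESIZE 64

/* Hypothetical layout: the context ends with ipad[statesize], opad[statesize]. */
struct fake_hmac_ctx {
    void *hash;                         /* inner crypto_shash pointer */
    unsigned char ipad[FAKE_STATESIZE];
    unsigned char opad[FAKE_STATESIZE];
};

int main(void)
{
    size_t ctx_size = sizeof(struct fake_hmac_ctx);
    size_t ss = FAKE_STATESIZE;

    /* ctx_size - 2 * ss lands on ipad; adding ss again lands on opad. */
    assert(ctx_size - 2 * ss == offsetof(struct fake_hmac_ctx, ipad));
    assert(ctx_size - 2 * ss + ss == offsetof(struct fake_hmac_ctx, opad));
    return 0;
}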

View File

@@ -156,7 +156,7 @@ NvS32 NV_API_CALL nv_request_msix_irq(nv_linux_state_t *nvl)
{
for( j = 0; j < i; j++)
{
free_irq(nvl->msix_entries[i].vector, (void *)nvl);
free_irq(nvl->msix_entries[j].vector, (void *)nvl);
}
break;
}
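The index fix above is the usual partial-failure rollback: when registration fails at vector i, only the vectors that already succeeded (indices 0 through i-1) must be released, so the inner loop has to index with j. A self-contained sketch of that pattern with invented stand-ins for request_irq()/free_irq():
/* acquire_one()/release_one() and resource_t are invented stand-ins. */
typedef struct { int acquired; } resource_t;

static int acquire_one(resource_t *r) { r->acquired = 1; return 0; }
static void release_one(resource_t *r) { r->acquired = 0; }

static int acquire_all(resource_t *entries, int count)
{
    int i, j;

    for (i = 0; i < count; i++)
    {
        if (acquire_one(&entries[i]) != 0)
        {
            /* Roll back only the entries that succeeded: index with j, not i. */
            for (j = 0; j < i; j++)
                release_one(&entries[j]);
            return -1;
        }
    }
    return 0;
}

int main(void)
{
    resource_t res[4] = {{0}};
    return acquire_all(res, 4);
}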

View File

@@ -316,14 +316,14 @@ int nvidia_p2p_init_mapping(
return -ENOTSUPP;
}
EXPORT_SYMBOL(nvidia_p2p_init_mapping);
NV_EXPORT_SYMBOL(nvidia_p2p_init_mapping);
int nvidia_p2p_destroy_mapping(uint64_t p2p_token)
{
return -ENOTSUPP;
}
EXPORT_SYMBOL(nvidia_p2p_destroy_mapping);
NV_EXPORT_SYMBOL(nvidia_p2p_destroy_mapping);
static void nv_p2p_mem_info_free_callback(void *data)
{
@@ -506,8 +506,13 @@ static int nv_p2p_get_pages(
(*page_table)->page_size = page_size_index;
os_free_mem(physical_addresses);
physical_addresses = NULL;
os_free_mem(wreqmb_h);
wreqmb_h = NULL;
os_free_mem(rreqmb_h);
rreqmb_h = NULL;
if (free_callback != NULL)
{
@@ -582,7 +587,7 @@ int nvidia_p2p_get_pages(
p2p_token, va_space, virtual_address,
length, page_table, free_callback, data);
}
EXPORT_SYMBOL(nvidia_p2p_get_pages);
NV_EXPORT_SYMBOL(nvidia_p2p_get_pages);
int nvidia_p2p_get_pages_persistent(
uint64_t virtual_address,
@@ -600,7 +605,7 @@ int nvidia_p2p_get_pages_persistent(
virtual_address, length, page_table,
NULL, NULL);
}
EXPORT_SYMBOL(nvidia_p2p_get_pages_persistent);
NV_EXPORT_SYMBOL(nvidia_p2p_get_pages_persistent);
/*
* This function is a no-op, but is left in place (for now), in order to allow
@@ -613,7 +618,7 @@ int nvidia_p2p_free_page_table(struct nvidia_p2p_page_table *page_table)
return 0;
}
EXPORT_SYMBOL(nvidia_p2p_free_page_table);
NV_EXPORT_SYMBOL(nvidia_p2p_free_page_table);
int nvidia_p2p_put_pages(
uint64_t p2p_token,
@@ -645,7 +650,7 @@ int nvidia_p2p_put_pages(
return nvidia_p2p_map_status(status);
}
EXPORT_SYMBOL(nvidia_p2p_put_pages);
NV_EXPORT_SYMBOL(nvidia_p2p_put_pages);
int nvidia_p2p_put_pages_persistent(
uint64_t virtual_address,
@@ -685,7 +690,7 @@ int nvidia_p2p_put_pages_persistent(
return nvidia_p2p_map_status(status);
}
EXPORT_SYMBOL(nvidia_p2p_put_pages_persistent);
NV_EXPORT_SYMBOL(nvidia_p2p_put_pages_persistent);
int nvidia_p2p_dma_map_pages(
struct pci_dev *peer,
@@ -800,7 +805,7 @@ failed:
return nvidia_p2p_map_status(status);
}
EXPORT_SYMBOL(nvidia_p2p_dma_map_pages);
NV_EXPORT_SYMBOL(nvidia_p2p_dma_map_pages);
int nvidia_p2p_dma_unmap_pages(
struct pci_dev *peer,
@@ -840,7 +845,7 @@ int nvidia_p2p_dma_unmap_pages(
return 0;
}
EXPORT_SYMBOL(nvidia_p2p_dma_unmap_pages);
NV_EXPORT_SYMBOL(nvidia_p2p_dma_unmap_pages);
/*
* This function is a no-op, but is left in place (for now), in order to allow
@@ -855,7 +860,7 @@ int nvidia_p2p_free_dma_mapping(
return 0;
}
EXPORT_SYMBOL(nvidia_p2p_free_dma_mapping);
NV_EXPORT_SYMBOL(nvidia_p2p_free_dma_mapping);
int nvidia_p2p_register_rsync_driver(
nvidia_p2p_rsync_driver_t *driver,
@@ -884,7 +889,7 @@ int nvidia_p2p_register_rsync_driver(
driver->wait_for_rsync, data);
}
EXPORT_SYMBOL(nvidia_p2p_register_rsync_driver);
NV_EXPORT_SYMBOL(nvidia_p2p_register_rsync_driver);
void nvidia_p2p_unregister_rsync_driver(
nvidia_p2p_rsync_driver_t *driver,
@@ -916,7 +921,7 @@ void nvidia_p2p_unregister_rsync_driver(
driver->wait_for_rsync, data);
}
EXPORT_SYMBOL(nvidia_p2p_unregister_rsync_driver);
NV_EXPORT_SYMBOL(nvidia_p2p_unregister_rsync_driver);
int nvidia_p2p_get_rsync_registers(
nvidia_p2p_rsync_reg_info_t **reg_info
@@ -1009,7 +1014,7 @@ int nvidia_p2p_get_rsync_registers(
return 0;
}
EXPORT_SYMBOL(nvidia_p2p_get_rsync_registers);
NV_EXPORT_SYMBOL(nvidia_p2p_get_rsync_registers);
void nvidia_p2p_put_rsync_registers(
nvidia_p2p_rsync_reg_info_t *reg_info
@@ -1041,4 +1046,4 @@ void nvidia_p2p_put_rsync_registers(
os_free_mem(reg_info);
}
EXPORT_SYMBOL(nvidia_p2p_put_rsync_registers);
NV_EXPORT_SYMBOL(nvidia_p2p_put_rsync_registers);

View File

@@ -1224,12 +1224,11 @@ static int nv_start_device(nv_state_t *nv, nvidia_stack_t *sp)
rm_read_registry_dword(sp, nv, NV_REG_ENABLE_MSI, &msi_config);
if (msi_config == 1)
{
if (pci_find_capability(nvl->pci_dev, PCI_CAP_ID_MSIX))
if (nvl->pci_dev->msix_cap && rm_is_msix_allowed(sp, nv))
{
nv_init_msix(nv);
}
if (pci_find_capability(nvl->pci_dev, PCI_CAP_ID_MSI) &&
!(nv->flags & NV_FLAG_USES_MSIX))
if (nvl->pci_dev->msi_cap && !(nv->flags & NV_FLAG_USES_MSIX))
{
nv_init_msi(nv);
}

View File

@@ -195,6 +195,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += devm_clk_bulk_get_all
NV_CONFTEST_FUNCTION_COMPILE_TESTS += get_task_ioprio
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mdev_set_iommu_device
NV_CONFTEST_FUNCTION_COMPILE_TESTS += offline_and_remove_memory
NV_CONFTEST_FUNCTION_COMPILE_TESTS += crypto_tfm_ctx_aligned
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_gpl_of_node_to_nid
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_gpl_sme_active
@@ -215,6 +216,7 @@ NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_get_dram_num_channe
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tegra_dram_types
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_pxm_to_node
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_screen_info
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_gpl_screen_info
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_i2c_bus_status
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tegra_fuse_control_read
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tegra_get_platform

View File

@@ -46,6 +46,11 @@ NvlStatus nvlink_lib_unload(void);
*/
NvlStatus nvlink_lib_ioctl_ctrl(nvlink_ioctrl_params *ctrl_params);
/*
* Gets the number of devices of type deviceType
*/
NvlStatus nvlink_lib_return_device_count_by_type(NvU32 deviceType, NvU32 *numDevices);
#ifdef __cplusplus
}
#endif

View File

@@ -28,6 +28,11 @@
#include "nv-time.h"
#include <linux/mmzone.h>
#include <linux/numa.h>
#include <linux/pid.h>
extern char *NVreg_TemporaryFilePath;
#define MAX_ERROR_STRING 512
@@ -1242,9 +1247,12 @@ void NV_API_CALL os_get_screen_info(
* SYSFB_SIMPLEFB registers a dummy framebuffer which does not contain the
* information required by os_get_screen_info(), therefore you need to
* fall back onto the screen_info structure.
*
* After commit b8466fe82b79 ("efi: move screen_info into efi init code")
* in v6.7, 'screen_info' is exported as a GPL-licensed symbol on ARM64.
*/
#if NV_IS_EXPORT_SYMBOL_PRESENT_screen_info
#if NV_CHECK_EXPORT_SYMBOL(screen_info)
/*
* If there is not a framebuffer console, return 0 size.
*
@@ -2122,6 +2130,43 @@ void NV_API_CALL os_nv_cap_close_fd
nv_cap_close_fd(fd);
}
/*
* Reads the total memory and free memory of a NUMA node from the kernel.
*/
NV_STATUS NV_API_CALL os_get_numa_node_memory_usage
(
NvS32 node_id,
NvU64 *free_memory_bytes,
NvU64 *total_memory_bytes
)
{
struct pglist_data *pgdat;
struct zone *zone;
NvU32 zone_id;
if (node_id >= MAX_NUMNODES)
{
nv_printf(NV_DBG_ERRORS, "Invalid NUMA node ID\n");
return NV_ERR_INVALID_ARGUMENT;
}
pgdat = NODE_DATA(node_id);
*free_memory_bytes = 0;
*total_memory_bytes = 0;
for (zone_id = 0; zone_id < MAX_NR_ZONES; zone_id++)
{
zone = &(pgdat->node_zones[zone_id]);
if (!populated_zone(zone))
continue;
*free_memory_bytes += (zone_page_state_snapshot(zone, NR_FREE_PAGES) * PAGE_SIZE);
*total_memory_bytes += (zone->present_pages * PAGE_SIZE);
}
return NV_OK;
}
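A brief caller-side sketch of the new helper, shown only as an illustration (node 0 and the debug print are arbitrary choices, not part of this change):
NvU64 free_bytes = 0, total_bytes = 0;

if (os_get_numa_node_memory_usage(0, &free_bytes, &total_bytes) == NV_OK &&
    total_bytes != 0)
{
    NvU64 used_pct = ((total_bytes - free_bytes) * 100) / total_bytes;
    nv_printf(NV_DBG_INFO, "NUMA node 0: %llu%% used\n", used_pct);
}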
typedef struct os_numa_gpu_mem_hotplug_notifier_s
{
NvU64 start_pa;
@@ -2373,3 +2418,28 @@ NV_STATUS NV_API_CALL os_offline_page_at_address
#endif
}
void* NV_API_CALL os_get_pid_info(void)
{
return get_task_pid(current, PIDTYPE_PID);
}
void NV_API_CALL os_put_pid_info(void *pid_info)
{
if (pid_info != NULL)
put_pid(pid_info);
}
NV_STATUS NV_API_CALL os_find_ns_pid(void *pid_info, NvU32 *ns_pid)
{
if ((pid_info == NULL) || (ns_pid == NULL))
return NV_ERR_INVALID_ARGUMENT;
*ns_pid = pid_vnr((struct pid *)pid_info);
// The call returns 0 if the PID is not found in the current ns
if (*ns_pid == 0)
return NV_ERR_OBJECT_NOT_FOUND;
return NV_OK;
}
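Taken together, the three helpers follow a get/translate/put pattern. A short illustrative fragment, assuming process context (the local names are not from this change):
void  *pid_info = os_get_pid_info();
NvU32  ns_pid   = 0;

if (os_find_ns_pid(pid_info, &ns_pid) == NV_OK)
{
    /* ns_pid is the caller's PID as seen from the current PID namespace. */
}

os_put_pid_info(pid_info);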