535.43.02

Andy Ritger
2023-05-30 10:11:36 -07:00
parent 6dd092ddb7
commit eb5c7665a1
1403 changed files with 295367 additions and 86235 deletions


@@ -30,78 +30,62 @@
#include "uvm_test.h"
#include "uvm_test_ioctl.h"
#if defined(NV_LINUX_SCHED_MM_H_PRESENT)
#include <linux/sched/mm.h>
#elif defined(NV_LINUX_SCHED_H_PRESENT)
#include <linux/sched.h>
#endif
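// The NV_LINUX_SCHED_MM_H_PRESENT / NV_LINUX_SCHED_H_PRESENT macros are
// presumably generated by the driver's conftest step; the guard selects
// whichever header provides the mm refcounting helpers (mmget_not_zero(),
// mmput()) on the running kernel.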
//
// This comment block describes some implementation rationale. See the header
// for the API descriptions.
//
// ========================= Retain count vs mm_users ==========================
//
// To guarantee the mm is available and won't be destroyed we require
// userspace to open a second file descriptor (uvm_mm_fd) and
// initialize it with uvm_api_mm_initialize(). During initialization
// we take a mm_users reference to ensure the mm remains valid until
// the file descriptor is closed.
//
// Kernel calls like mmu_interval_notifier_insert() require mm_users to be
// greater than 0; in general, these are the ioctl paths.
// To ensure userspace can't close the file descriptor and drop the
// mm_users refcount while the mm is in use, threads must call either
// uvm_va_space_mm_retain() or uvm_va_space_mm_or_current_retain() to
// increment the retained count. These calls also check that userspace
// has initialized the uvm_mm_fd and therefore holds a valid pagetable
// pin.
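//
// As an illustrative sketch (hypothetical caller, not code from this
// change), an ioctl path brackets its use of the mm like so:
//
//     struct mm_struct *mm = uvm_va_space_mm_or_current_retain(va_space);
//     if (!mm)
//         return NV_ERR_PAGE_TABLE_NOT_AVAIL; // uvm_mm_fd not initialized
//     /* ... operate on the address space ... */
//     uvm_va_space_mm_or_current_release(va_space, mm);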
//
// Closing uvm_mm_fd will call uvm_va_space_mm_shutdown() prior to
// mmput(), which ensures there are no active users of the mm. This
// indirection is required because not all threads can call mmput()
// directly. In particular, the replayable GPU fault handling path
// can't call mmput() because it may result in exit_mmap(), which could
// result in RM calls and VA space destroy, and those need to wait for
// the GPU fault handler to finish.
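//
// (For reference, the chain being avoided is roughly mmput() ->
// __mmput() -> exit_mmap(), which tears down the address space and can
// trigger the teardown work described below.)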
//
// ============================ Handling mm teardown ===========================
//
// When the process is exiting, we will get notified either via an
// explicit close of uvm_mm_fd or implicitly as part of
// exit_files(). We are guaranteed to get this call because we don't
// allow mmap on uvm_mm_fd, and the userspace pagetables (mm_users)
// are guaranteed to exist because we hold an mm_users refcount
// which is released as part of file close.
//
// This allows any outstanding GPU faults to be processed. To prevent
// new faults from occurring, uvm_va_space_mm_shutdown() is called to
// stop all GPU memory accesses to the mm. Once all GPU accesses have
// been stopped, no new retainers of the va_space are allowed and the
// mm_users reference is dropped, potentially tearing down the mm and
// its associated pagetables.
//
// This essentially shuts down the VA space for new work. The VA space
// object remains valid for most teardown ioctls until the file is
// closed, because it's legal for the associated process to die, then
// for another process with a reference on the file to perform the
// unregisters or associated ioctls. This is particularly true for
// tools users.
//
// An exception to the above is UvmUnregisterChannel. Since channels are
// completely removed from the VA space on mm teardown, later channel
// unregisters will fail to find the handles and will return an error.
//
// The UVM driver will only call mmu_notifier_unregister during VA space destroy
// (file close).
//
// Here is a table of the various teardown scenarios:
//
//                                                           Can race with
// Scenario                                                  mm teardown
// -----------------------------------------------------------------------------
// 1) Process exit (mm teardown, file open)                       -
// 2) Explicit file close in original mm                          No
// 3) Explicit file close in different mm                         Yes
// 4) Implicit file close (exit) in original mm                   No
// 5) Implicit file close (exit) in different mm                  Yes
//
// At a high level, the sequence of operations to perform prior to mm
// teardown is:
//
// 1) Stop all channels
// - Prevents new faults and accesses on non-MPS
@@ -121,7 +105,7 @@
// the case of MPS, cancel real faults after.
// 4) UnsetPageDir
// - Prevents new accesses on MPS
// 5) Mark the va_space_mm as released
// - Prevents new retainers from using the mm. There won't be any more on
// the fault handling paths, but there could be others in worker threads.
//
@@ -170,7 +154,7 @@
//
// =============================================================================
static void uvm_va_space_mm_shutdown(uvm_va_space_t *va_space);
static int uvm_enable_va_space_mm = 1;
module_param(uvm_enable_va_space_mm, int, S_IRUGO);
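// Note: since the parameter is registered with S_IRUGO, its value should
// be readable at /sys/module/nvidia_uvm/parameters/uvm_enable_va_space_mm
// (assuming the module is loaded under the name nvidia_uvm).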
@@ -195,22 +179,7 @@ bool uvm_va_space_mm_enabled(uvm_va_space_t *va_space)
return uvm_va_space_mm_enabled_system();
}
#if UVM_CAN_USE_MMU_NOTIFIERS()
static uvm_va_space_t *get_va_space(struct mmu_notifier *mn)
{
// This may be called without a thread context present, so be careful
@@ -218,11 +187,6 @@ static bool mmget_not_zero(struct mm_struct *mm)
return container_of(mn, uvm_va_space_t, va_space_mm.mmu_notifier);
}
static void uvm_mmu_notifier_invalidate_range_ats(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
@@ -249,14 +213,8 @@ static bool mmget_not_zero(struct mm_struct *mm)
UVM_ENTRY_VOID(uvm_ats_invalidate(get_va_space(mn), start, end));
}
static struct mmu_notifier_ops uvm_mmu_notifier_ops_ats =
{
.invalidate_range = uvm_mmu_notifier_invalidate_range_ats,
};
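// A minimal sketch of the mmu_notifier pattern used above (illustrative
// only; all example_* names are hypothetical). The notifier is embedded
// in a containing object and recovered with container_of(), exactly as
// get_va_space() does:
//
//     struct example_ctx {
//         struct mmu_notifier mn;
//     };
//
//     static void example_invalidate(struct mmu_notifier *mn,
//                                    struct mm_struct *mm,
//                                    unsigned long start,
//                                    unsigned long end)
//     {
//         struct example_ctx *ctx = container_of(mn, struct example_ctx, mn);
//         /* drop any stale translations for [start, end) held by ctx */
//     }
//
//     static const struct mmu_notifier_ops example_ops = {
//         .invalidate_range = example_invalidate,
//     };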
@@ -265,11 +223,7 @@ static bool mmget_not_zero(struct mm_struct *mm)
UVM_ASSERT(va_space_mm->mm);
uvm_assert_mmap_lock_locked_write(va_space_mm->mm);
va_space_mm->mmu_notifier.ops = &uvm_mmu_notifier_ops_ats;
return __mmu_notifier_register(&va_space_mm->mmu_notifier, va_space_mm->mm);
}
@@ -278,11 +232,6 @@ static bool mmget_not_zero(struct mm_struct *mm)
mmu_notifier_unregister(&va_space_mm->mmu_notifier, va_space_mm->mm);
}
#else
static int uvm_mmu_notifier_register(uvm_va_space_mm_t *va_space_mm)
{
UVM_ASSERT(0);
@@ -303,26 +252,28 @@ NV_STATUS uvm_va_space_mm_register(uvm_va_space_t *va_space)
uvm_assert_mmap_lock_locked_write(current->mm);
uvm_assert_rwsem_locked_write(&va_space->lock);
va_space_mm->state = UVM_VA_SPACE_MM_STATE_UNINITIALIZED;
if (!uvm_va_space_mm_enabled(va_space))
return NV_OK;
UVM_ASSERT(!va_space_mm->mm);
va_space_mm->mm = current->mm;
uvm_mmgrab(va_space_mm->mm);
if (UVM_ATS_IBM_SUPPORTED_IN_DRIVER() && g_uvm_global.ats.enabled) {
ret = uvm_mmu_notifier_register(va_space_mm);
if (ret) {
// Inform uvm_va_space_mm_unregister() that it has nothing to do.
uvm_mmdrop(va_space_mm->mm);
va_space_mm->mm = NULL;
return errno_to_nv_status(ret);
}
}
return NV_OK;
}
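// A note on the two mm reference counts, assuming uvm_mmgrab()/uvm_mmdrop()
// wrap the kernel's mmgrab()/mmdrop() (a sketch of the contract, not code
// from this change):
//
//     mmgrab(mm);                 // pins the mm_struct itself (mm_count);
//                                 // the pagetables may still be torn down
//     mmdrop(mm);
//
//     if (mmget_not_zero(mm)) {   // pins the address space (mm_users),
//         /* ... */               // keeping the pagetables usable
//         mmput(mm);
//     }
//
// This is why register/unregister only need uvm_mmgrab()/uvm_mmdrop(): the
// mm_users pin comes from uvm_mm_fd initialization as described in the
// header comment.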
@@ -330,25 +281,35 @@ void uvm_va_space_mm_unregister(uvm_va_space_t *va_space)
{
uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm;
// We can't hold the VA space lock or mmap_lock because
// uvm_va_space_mm_shutdown() waits for retainers which may take
// these locks.
uvm_assert_unlocked_order(UVM_LOCK_ORDER_MMAP_LOCK);
uvm_assert_unlocked_order(UVM_LOCK_ORDER_VA_SPACE);
uvm_va_space_mm_shutdown(va_space);
UVM_ASSERT(va_space_mm->retained_count == 0);
// Only happens if uvm_va_space_mm_register() fails
if (!va_space_mm->mm)
return;
// At this point the mm is still valid because uvm_mm_release()
// hasn't yet called mmput(). uvm_hmm_va_space_destroy() will kill
// all the va_blocks along with any associated gpu_chunks, so we
// need to make sure these chunks are freed first. However, freeing
// them requires a valid mm so we can call migrate_vma_setup(), so we
// do that here.
// TODO: Bug 3902536: [UVM-HMM] add code to migrate GPU memory
// without having a va_block
if (uvm_hmm_is_enabled(va_space))
uvm_hmm_evict_va_blocks(va_space);
if (uvm_va_space_mm_enabled(va_space)) {
if (UVM_ATS_IBM_SUPPORTED_IN_DRIVER() && g_uvm_global.ats.enabled)
uvm_mmu_notifier_unregister(va_space_mm);
uvm_mmdrop(va_space_mm->mm);
}
va_space_mm->mm = NULL;
}
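// A rough sketch of the uvm_mm_fd close ordering described in the header
// comment. uvm_mm_release() is named in the comment above; its body here
// is an assumption, not code from this change:
//
//     static int uvm_mm_release(struct inode *inode, struct file *filp)
//     {
//         uvm_va_space_mm_shutdown(va_space); // flush retainers, stop GPU accesses
//         mmput(va_space_mm->mm);             // drop the mm_users pin taken at init
//         return 0;
//     }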
struct mm_struct *uvm_va_space_mm_retain(uvm_va_space_t *va_space)
@@ -361,11 +322,19 @@ struct mm_struct *uvm_va_space_mm_retain(uvm_va_space_t *va_space)
uvm_spin_lock(&va_space_mm->lock);
if (!uvm_va_space_mm_alive(va_space_mm))
goto out;
++va_space_mm->retained_count;
mm = va_space_mm->mm;
UVM_ASSERT(mm);
out:
// uvm_api_mm_init() holds a reference
if (mm)
UVM_ASSERT(atomic_read(&mm->mm_users) > 0);
uvm_spin_unlock(&va_space_mm->lock);
@@ -374,8 +343,6 @@ struct mm_struct *uvm_va_space_mm_retain(uvm_va_space_t *va_space)
struct mm_struct *uvm_va_space_mm_or_current_retain(uvm_va_space_t *va_space)
{
// We should only attempt to use current->mm from a user thread
UVM_ASSERT(!(current->flags & PF_KTHREAD));
@@ -384,19 +351,22 @@ struct mm_struct *uvm_va_space_mm_or_current_retain(uvm_va_space_t *va_space)
if (!current->mm)
return NULL;
// If !uvm_va_space_mm_enabled() we use current->mm on the ioctl
// paths. In that case we don't need to mmget(current->mm) because
// the current thread's mm is always valid to use while non-NULL. On
// uvm_va_space_mm_enabled() systems we skip trying to retain the
// mm if it is current->mm, because userspace may not have
// initialized the mm fd but UVM callers on the ioctl path still
// assume retaining current->mm will succeed.
if (!uvm_va_space_mm_enabled(va_space))
return current->mm;
return uvm_va_space_mm_retain(va_space);
}
void uvm_va_space_mm_release(uvm_va_space_t *va_space)
{
uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm;
UVM_ASSERT(uvm_va_space_mm_enabled(va_space));
@@ -409,84 +379,26 @@ void uvm_va_space_mm_release(uvm_va_space_t *va_space)
--va_space_mm->retained_count;
// If we're the last retainer on a dead mm, signal any potential waiters
if (va_space_mm->retained_count == 0 && !uvm_va_space_mm_alive(va_space_mm)) {
uvm_spin_unlock(&va_space_mm->lock);
// There could be a thread in uvm_va_space_mm_shutdown()
// waiting on us, so wake it up.
wake_up(&va_space_mm->last_retainer_wait_queue);
}
else {
uvm_spin_unlock(&va_space_mm->lock);
}
}
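// A minimal sketch of the waiter this wake_up() pairs with; per the
// comments above, uvm_va_space_mm_shutdown() is expected to do roughly
// the following (illustrative, not code from this change):
//
//     uvm_spin_lock(&va_space_mm->lock);
//     va_space_mm->state = UVM_VA_SPACE_MM_STATE_RELEASED;
//     uvm_spin_unlock(&va_space_mm->lock);
//
//     wait_event(va_space_mm->last_retainer_wait_queue,
//                va_space_mm->retained_count == 0);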
void uvm_va_space_mm_or_current_release(uvm_va_space_t *va_space, struct mm_struct *mm)
{
if (!uvm_va_space_mm_enabled(va_space) || !mm)
return;
uvm_va_space_mm_release(va_space);
}
// Handles the va_space's mm being torn down while the VA space still exists.
// This function won't return until all in-flight retainers have called
// uvm_va_space_mm_release(). Subsequent calls to uvm_va_space_mm_retain() will
// return NULL.
//
// After this call returns the VA space is essentially dead. GPUs cannot make
// any new memory accesses in registered GPU VA spaces, and no more GPU faults
// which are attributed to this VA space will arrive. Additionally, no more
// registration within the VA space is allowed (GPU, GPU VA space, or channel).
//
// The requirements for this function are that, once we return, the GPU and
// driver are completely done using the associated mm_struct. This includes:
//
// 1) GPUs will not issue any more memory accesses under this mm
// 2) [ATS only] GPUs will not issue any more ATRs under this mm
// 3) The driver will not ask the kernel to service faults on this mm
//
static void uvm_va_space_mm_shutdown(uvm_va_space_t *va_space)
{
uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm;
@@ -495,29 +407,6 @@ static void uvm_va_space_mm_shutdown(uvm_va_space_t *va_space)
uvm_global_processor_mask_t gpus_to_flush;
LIST_HEAD(deferred_free_list);
uvm_va_space_down_write(va_space);
// Prevent future registrations of any kind. We'll be iterating over all
@@ -545,9 +434,9 @@ static void uvm_va_space_mm_shutdown(uvm_va_space_t *va_space)
uvm_global_mask_retain(&gpus_to_flush);
uvm_va_space_up_write(va_space);
// Flush the fault buffer on all GPUs. This will avoid spurious
// cancels of stale pending translated faults after we set
// UVM_VA_SPACE_MM_STATE_RELEASED later.
for_each_global_gpu_in_mask(gpu, &gpus_to_flush)
uvm_gpu_fault_buffer_flush(gpu);
@@ -593,7 +482,7 @@ static void uvm_va_space_mm_shutdown(uvm_va_space_t *va_space)
// Now that there won't be any new GPU faults, prevent subsequent retainers
// from accessing this mm.
uvm_spin_lock(&va_space_mm->lock);
va_space_mm->state = UVM_VA_SPACE_MM_STATE_RELEASED;
uvm_spin_unlock(&va_space_mm->lock);
// Finish channel destroy. This can be done at any point after detach as
@@ -666,26 +555,6 @@ NV_STATUS uvm_test_va_space_mm_retain(UVM_TEST_VA_SPACE_MM_RETAIN_PARAMS *params
return status;
}
NV_STATUS uvm_test_va_space_mm_or_current_retain(UVM_TEST_VA_SPACE_MM_OR_CURRENT_RETAIN_PARAMS *params,
struct file *filp)
{
@@ -704,13 +573,9 @@ NV_STATUS uvm_test_va_space_mm_or_current_retain(UVM_TEST_VA_SPACE_MM_OR_CURRENT
status = NV_ERR_INVALID_ARGUMENT;
}
if (status == NV_OK && params->sleep_us)
usleep_range(params->sleep_us, params->sleep_us + 1000);
uvm_va_space_mm_or_current_release(va_space, mm);
return status;