515.43.04

2026-04-22 15:29:04 +00:00 · 2022-05-09 13:18:59 -07:00
commit 1739a20efc
2519 changed files with 1060036 additions and 0 deletions
--- a/kernel-open/nvidia-uvm/uvm_gpu_non_replayable_faults.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu_non_replayable_faults.c
@@ -0,0 +1,683 @@
+/*******************************************************************************
+    Copyright (c) 2017-2021 NVIDIA Corporation
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to
+    deal in the Software without restriction, including without limitation the
+    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+    sell copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+        The above copyright notice and this permission notice shall be
+        included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+*******************************************************************************/
+
+#include "nv_uvm_interface.h"
+#include "uvm_common.h"
+#include "uvm_api.h"
+#include "uvm_gpu_non_replayable_faults.h"
+#include "uvm_gpu.h"
+#include "uvm_hal.h"
+#include "uvm_lock.h"
+#include "uvm_tools.h"
+#include "uvm_user_channel.h"
+#include "uvm_va_space_mm.h"
+#include "uvm_va_block.h"
+#include "uvm_va_range.h"
+#include "uvm_kvmalloc.h"
+#include "uvm_ats_faults.h"
+
+// In the context of a CUDA application using Unified Memory, it is sometimes
+// assumed that there is a single type of fault, originated by a memory
+// load/store in a SM (Graphics Engine), which itself can be traced back to a
+// memory access in a CUDA kernel written by a developer. In reality, faults can
+// also be triggered by other parts of the GPU i.e. by other engines, as the
+// result of developer-facing APIs, or operations initiated by a user-mode
+// driver. The Graphics Engine faults are called replayable faults, while the
+// rest are called non-replayable. The differences between the two types of
+// faults go well beyond the engine originating the fault.
+//
+// A non-replayable fault originates in an engine other than Graphics. UVM
+// services non-replayable faults from the Copy and PBDMA (Host/ESCHED) Engines.
+// Non-replayable faults originated in other engines are considered fatal, and
+// do not reach the UVM driver. While UVM can distinguish between faults
+// originated in the Copy Engine and faults originated in the PBDMA Engine, in
+// practice they are all processed in the same way. Replayable fault support in
+// Graphics was introduced in Pascal, and non-replayable fault support in CE and
+// PBDMA Engines was introduced in Volta; all non-replayable faults were fatal
+// before Volta.
+//
+// An example of a Copy Engine non-replayable fault is a memory copy between two
+// virtual addresses on a GPU, in which either the source or destination
+// pointers are not currently mapped to a physical address in the page tables of
+// the GPU. An example of a PBDMA non-replayable fault is a semaphore acquire in
+// which the semaphore virtual address passed as argument is currently not
+// mapped to any physical address.
+//
+// Non-replayable faults originated in the CE and PBDMA Engines result in HW
+// preempting the channel associated with the fault, a mechanism called "fault
+// and switch". More precisely, the switching out affects not only the channel
+// that caused the fault, but all the channels in the same Time Slice Group
+// (TSG). SW intervention is required so all the channels in the TSG can be
+// scheduled again, but channels in other TSGs can be scheduled and resume their
+// normal execution. In the case of the non-replayable faults serviced by UVM,
+// the driver clears a channel's faulted bit upon successful servicing, but it
+// is only when the servicing has completed for all the channels in the TSG that
+// they are all allowed to be switched in. Non-replayable faults originated in
+// engines other than CE and PBDMA are fatal because these other units lack
+// hardware support for the "fault and switch" and restart mechanisms just
+// described.
+//
+// On the other hand, replayable faults block preemption of the channel until
+// software (UVM) services the fault. This is sometimes known as "fault and
+// stall". Note that replayable faults prevent the execution of other channels,
+// which are stalled until the fault is serviced.
+//
+// The "non-replayable" naming alludes to the fact that, historically, these
+// faults indicated a fatal condition so there was no recovery ("replay")
+// process, and SW could not ignore or drop the fault. As discussed before, this
+// is no longer the case and while at times the hardware documentation uses the
+// "fault and replay" expression for CE and PBDMA faults, we reserve that
+// expression for Graphics faults and favor the term "fault and reschedule"
+// instead. Replaying a fault does not necessarily imply that UVM has serviced
+// it. For example, the UVM driver may choose to ignore the replayable faults
+// associated with a GPU for some period of time if it detects that there is
+// thrashing going on, and the GPU needs to be throttled. The fault entries
+// corresponding to the ignored faults are never saved by UVM, but new entries
+// (and new interrupts) will be generated by hardware each time after UVM issues
+// a replay.
+//
+// While replayable faults are always the responsibility of UVM, the servicing
+// of non-replayable faults is split between RM and UVM. In the case of
+// replayable faults, UVM has sole SW ownership of the hardware buffer
+// containing the faults, and it is responsible for updating the GET pointer to
+// signal the hardware that a number of faults have been read. UVM also reads
+// the PUT pointer value written by hardware. But in the case of non-replayable
+// faults, UVM reads the fault entries out of a regular CPU buffer, shared with
+// RM, called "shadow buffer". RM is responsible for accessing the actual
+// non-replayable hardware buffer, reading the PUT pointer, updating the GET
+// pointer, and moving CE and PBDMA faults from the hardware buffer to the
+// shadow buffer. Because the Resource Manager owns the HW buffer, UVM needs to
+// call RM when servicing a non-replayable fault, first to figure out if there
+// is a pending fault, and then to read entries from the shadow buffer.
+//
+// Once UVM has parsed a non-replayable fault entry corresponding to managed
+// memory, and identified the VA block associated with it, the servicing logic
+// for that block is identical to that of a replayable fault, see
+// uvm_va_block_service_locked. Another similarity between the two types of
+// faults is that they use the same entry format, uvm_fault_buffer_entry_t.
+
+
+// Sets up the CPU-side state used to service non-replayable faults on the
+// given parent GPU: the shadow_buffer_copy buffer, the fault_cache array of
+// parsed entries (max_faults elements), and the two trackers.
+//
+// Returns NV_ERR_NO_MEMORY if either allocation fails, NV_OK otherwise.
+//
+// This function does not clean up after itself on failure. The caller is
+// responsible for calling uvm_gpu_fault_buffer_deinit_non_replayable_faults
+// when an error is returned.
+NV_STATUS uvm_gpu_fault_buffer_init_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
+{
+    uvm_non_replayable_fault_buffer_info_t *nr_faults = &parent_gpu->fault_buffer_info.non_replayable;
+
+    UVM_ASSERT(parent_gpu->non_replayable_faults_supported);
+
+    // Start from a known state so that deinit can tell which pieces were set
+    // up if we bail out half-way.
+    nr_faults->fault_cache        = NULL;
+    nr_faults->shadow_buffer_copy = NULL;
+
+    // Number of fault entries that fit in the buffer size reported by RM
+    nr_faults->max_faults = parent_gpu->fault_buffer_info.rm_info.nonReplayable.bufferSize /
+                            parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
+
+    nr_faults->shadow_buffer_copy = uvm_kvmalloc_zero(parent_gpu->fault_buffer_info.rm_info.nonReplayable.bufferSize);
+    if (nr_faults->shadow_buffer_copy == NULL)
+        return NV_ERR_NO_MEMORY;
+
+    nr_faults->fault_cache = uvm_kvmalloc_zero(nr_faults->max_faults * sizeof(*nr_faults->fault_cache));
+    if (nr_faults->fault_cache == NULL)
+        return NV_ERR_NO_MEMORY;
+
+    // The trackers are only initialized once both allocations have succeeded
+    uvm_tracker_init(&nr_faults->clear_faulted_tracker);
+    uvm_tracker_init(&nr_faults->fault_service_tracker);
+
+    return NV_OK;
+}
+
+// Tears down the state created by
+// uvm_gpu_fault_buffer_init_non_replayable_faults. Both buffer pointers are
+// reset to NULL after being freed.
+void uvm_gpu_fault_buffer_deinit_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
+{
+    uvm_non_replayable_fault_buffer_info_t *nr_faults = &parent_gpu->fault_buffer_info.non_replayable;
+
+    // Init sets up the trackers only after fault_cache has been allocated, so
+    // a non-NULL fault_cache means there are trackers to tear down.
+    if (nr_faults->fault_cache != NULL) {
+        UVM_ASSERT(uvm_tracker_is_empty(&nr_faults->clear_faulted_tracker));
+        uvm_tracker_deinit(&nr_faults->clear_faulted_tracker);
+
+        UVM_ASSERT(uvm_tracker_is_empty(&nr_faults->fault_service_tracker));
+        uvm_tracker_deinit(&nr_faults->fault_service_tracker);
+    }
+
+    uvm_kvfree(nr_faults->shadow_buffer_copy);
+    nr_faults->shadow_buffer_copy = NULL;
+
+    uvm_kvfree(nr_faults->fault_cache);
+    nr_faults->fault_cache = NULL;
+}
+
+// Asks RM whether there are non-replayable faults waiting to be serviced on
+// the given parent GPU.
+//
+// Returns true only when the RM query succeeds and reports pending faults. A
+// failing query is flagged with UVM_ASSERT and reported as "nothing pending",
+// so the result never depends on an unwritten output parameter.
+bool uvm_gpu_non_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
+{
+    NV_STATUS status;
+
+    // Initialized so the value is well-defined even if RM fails without
+    // writing the output parameter.
+    NvBool has_pending_faults = NV_FALSE;
+
+    UVM_ASSERT(parent_gpu->isr.non_replayable_faults.handling);
+
+    status = nvUvmInterfaceHasPendingNonReplayableFaults(&parent_gpu->fault_buffer_info.rm_info,
+                                                         &has_pending_faults);
+    UVM_ASSERT(status == NV_OK);
+    if (status != NV_OK)
+        return false;
+
+    return has_pending_faults == NV_TRUE;
+}
+
+// Asks RM for the pending non-replayable fault packets, which are written to
+// shadow_buffer_copy, and parses each packet into the corresponding slot of
+// fault_cache.
+//
+// Returns the number of entries parsed into fault_cache. If the RM call fails,
+// the failure is asserted and logged, nothing is parsed, and 0 is returned.
+//
+// The caller must hold the non-replayable faults service lock.
+static NvU32 fetch_non_replayable_fault_buffer_entries(uvm_gpu_t *gpu)
+{
+    NV_STATUS status;
+    NvU32 i;
+    NvU32 cached_faults = 0;
+    NvU32 entry_size = gpu->parent->fault_buffer_hal->entry_size(gpu->parent);
+    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
+    char *current_hw_entry = (char *)non_replayable_faults->shadow_buffer_copy;
+
+    UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.non_replayable_faults.service_lock));
+    UVM_ASSERT(gpu->parent->non_replayable_faults_supported);
+
+    status = nvUvmInterfaceGetNonReplayableFaults(&gpu->parent->fault_buffer_info.rm_info,
+                                                  non_replayable_faults->shadow_buffer_copy,
+                                                  &cached_faults);
+    UVM_ASSERT(status == NV_OK);
+    if (status != NV_OK) {
+        // Do not parse a buffer RM may not have filled in
+        UVM_ERR_PRINT("Error fetching non-replayable faults: %s, GPU %s\n",
+                      nvstatusToString(status),
+                      uvm_gpu_name(gpu));
+        return 0;
+    }
+
+    // Both shadow_buffer_copy and fault_cache are sized for max_faults
+    // entries, so a larger count would run past the end of both.
+    UVM_ASSERT(cached_faults <= non_replayable_faults->max_faults);
+
+    // Parse all faults
+    for (i = 0; i < cached_faults; ++i) {
+        uvm_fault_buffer_entry_t *fault_entry = &non_replayable_faults->fault_cache[i];
+
+        gpu->parent->fault_buffer_hal->parse_non_replayable_entry(gpu->parent, current_hw_entry, fault_entry);
+
+        // The GPU aligns the fault addresses to 4k, but all of our tracking is
+        // done in PAGE_SIZE chunks which might be larger.
+        fault_entry->fault_address = UVM_PAGE_ALIGN_DOWN(fault_entry->fault_address);
+
+        // Make sure that all fields in the entry are properly initialized
+        fault_entry->va_space = NULL;
+        fault_entry->is_fatal = (fault_entry->fault_type >= UVM_FAULT_TYPE_FATAL);
+        fault_entry->filtered = false;
+
+        fault_entry->num_instances = 1;
+        fault_entry->access_type_mask = uvm_fault_access_type_mask_bit(fault_entry->fault_access_type);
+        INIT_LIST_HEAD(&fault_entry->merged_instances_list);
+
+        // Remember which packet of the shadow buffer copy this entry came
+        // from, so the raw packet can be located again later.
+        fault_entry->non_replayable.buffer_index = i;
+
+        if (fault_entry->is_fatal) {
+            // Record the fatal fault event later as we need the va_space locked
+            fault_entry->fatal_reason = UvmEventFatalReasonInvalidFaultType;
+        }
+        else {
+            fault_entry->fatal_reason = UvmEventFatalReasonInvalid;
+        }
+
+        current_hw_entry += entry_size;
+    }
+
+    return cached_faults;
+}
+
+// Tells whether the faulted bit must be cleared through the SW method.
+//
+// In SRIOV, the UVM (guest) driver does not have access to the privileged
+// registers used to clear the faulted bit. Instead, UVM requests host RM to do
+// the clearing on its behalf, using a SW method.
+static bool use_clear_faulted_channel_sw_method(uvm_gpu_t *gpu)
+{
+    if (!uvm_gpu_is_virt_mode_sriov(gpu))
+        return false;
+
+    // An SRIOV GPU is expected to support the SW method
+    UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);
+
+    return true;
+}
+
+// Clears the faulted bit of user_channel with a push on a MEMOPS channel. The
+// push acquires the given tracker first, and uses either the clear-faulted
+// method or its SW-method variant (see use_clear_faulted_channel_sw_method).
+//
+// On success the push has been added to both the per-GPU and the per-channel
+// clear_faulted_tracker. Returns the first error hit while beginning the push
+// or adding it to the trackers.
+static NV_STATUS clear_faulted_method_on_gpu(uvm_gpu_t *gpu,
+                                             uvm_user_channel_t *user_channel,
+                                             const uvm_fault_buffer_entry_t *fault_entry,
+                                             NvU32 batch_id,
+                                             uvm_tracker_t *tracker)
+{
+    uvm_push_t push;
+    NV_STATUS status;
+    uvm_non_replayable_fault_buffer_info_t *nr_faults = &gpu->parent->fault_buffer_info.non_replayable;
+
+    UVM_ASSERT(!fault_entry->is_fatal);
+
+    status = uvm_push_begin_acquire(gpu->channel_manager,
+                                    UVM_CHANNEL_TYPE_MEMOPS,
+                                    tracker,
+                                    &push,
+                                    "Clearing set bit for address 0x%llx",
+                                    fault_entry->fault_address);
+    if (status != NV_OK) {
+        UVM_ERR_PRINT("Error acquiring tracker before clearing faulted: %s, GPU %s\n",
+                      nvstatusToString(status),
+                      uvm_gpu_name(gpu));
+        return status;
+    }
+
+    if (!use_clear_faulted_channel_sw_method(gpu))
+        gpu->parent->host_hal->clear_faulted_channel_method(&push, user_channel, fault_entry);
+    else
+        gpu->parent->host_hal->clear_faulted_channel_sw_method(&push, user_channel, fault_entry);
+
+    uvm_tools_broadcast_replay(gpu, &push, batch_id, fault_entry->fault_source.client_type);
+
+    uvm_push_end(&push);
+
+    // Track the push in the GPU's clear_faulted_tracker so GPU removal can
+    // wait on it.
+    status = uvm_tracker_add_push_safe(&nr_faults->clear_faulted_tracker, &push);
+    if (status != NV_OK)
+        return status;
+
+    // Also track the push in the channel's clear_faulted_tracker so user
+    // channel removal can wait on it instead of using the per-GPU tracker,
+    // which would require a lock.
+    return uvm_tracker_add_push_safe(&user_channel->clear_faulted_tracker, &push);
+}
+
+// Clears the faulted bit of user_channel by writing the channel register from
+// the CPU. Used on GPUs that do not have the clear-faulted channel method.
+//
+// Blocks until all the work in tracker has completed. Returns the tracker wait
+// error, if any, in which case the register is not written.
+static NV_STATUS clear_faulted_register_on_gpu(uvm_gpu_t *gpu,
+                                               uvm_user_channel_t *user_channel,
+                                               const uvm_fault_buffer_entry_t *fault_entry,
+                                               NvU32 batch_id,
+                                               uvm_tracker_t *tracker)
+{
+    NV_STATUS wait_status;
+
+    UVM_ASSERT(!gpu->parent->has_clear_faulted_channel_method);
+
+    // All pending work must be done before the channel register is written
+    wait_status = uvm_tracker_wait(tracker);
+    if (wait_status != NV_OK)
+        return wait_status;
+
+    gpu->parent->host_hal->clear_faulted_channel_register(user_channel, fault_entry);
+
+    uvm_tools_broadcast_replay_sync(gpu, batch_id, fault_entry->fault_source.client_type);
+
+    return NV_OK;
+}
+
+// Clears the faulted bit of user_channel, picking the mechanism supported by
+// the GPU: a pushed method (HW or SW variant) when available, a direct
+// register write otherwise.
+static NV_STATUS clear_faulted_on_gpu(uvm_gpu_t *gpu,
+                                      uvm_user_channel_t *user_channel,
+                                      const uvm_fault_buffer_entry_t *fault_entry,
+                                      NvU32 batch_id,
+                                      uvm_tracker_t *tracker)
+{
+    bool use_method = gpu->parent->has_clear_faulted_channel_method || use_clear_faulted_channel_sw_method(gpu);
+
+    if (!use_method)
+        return clear_faulted_register_on_gpu(gpu, user_channel, fault_entry, batch_id, tracker);
+
+    return clear_faulted_method_on_gpu(gpu, user_channel, fault_entry, batch_id, tracker);
+}
+
+// Services a single non-fatal fault that falls within va_block. This is the
+// body passed to UVM_VA_BLOCK_RETRY_LOCKED by service_managed_fault_in_block,
+// so it may run more than once for the same fault;
+// service_context->num_retries counts the attempts.
+//
+// The VA space lock is asserted to be held on entry.
+//
+// If the logical permission check fails, the fault entry is marked fatal, its
+// fatal_reason is set, and NV_OK is returned without servicing the block.
+// Otherwise the service context is set up for the single faulting page and the
+// status of uvm_va_block_service_locked is returned.
+static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
+                                                       uvm_va_block_t *va_block,
+                                                       uvm_va_block_retry_t *va_block_retry,
+                                                       uvm_fault_buffer_entry_t *fault_entry,
+                                                       uvm_service_block_context_t *service_context)
+{
+    NV_STATUS status = NV_OK;
+    uvm_page_index_t page_index;
+    uvm_perf_thrashing_hint_t thrashing_hint;
+    uvm_processor_id_t new_residency;
+    bool read_duplicate;
+    uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
+    uvm_va_range_t *va_range = va_block->va_range;
+    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
+
+    UVM_ASSERT(!fault_entry->is_fatal);
+
+    uvm_assert_rwsem_locked(&va_space->lock);
+
+    // The fault must belong to this VA space and lie within the block
+    UVM_ASSERT(fault_entry->va_space == va_space);
+    UVM_ASSERT(fault_entry->fault_address >= va_block->start);
+    UVM_ASSERT(fault_entry->fault_address <= va_block->end);
+
+    service_context->block_context.policy = uvm_va_policy_get(va_block, fault_entry->fault_address);
+
+    // Only notify on the first attempt, so that a retried fault is not
+    // reported more than once and batch_id is bumped once per fault.
+    if (service_context->num_retries == 0) {
+        // notify event to tools/performance heuristics. For now we use a
+        // unique batch id per fault, since we clear the faulted channel for
+        // each fault.
+        uvm_perf_event_notify_gpu_fault(&va_space->perf_events,
+                                        va_block,
+                                        gpu->id,
+                                        service_context->block_context.policy->preferred_location,
+                                        fault_entry,
+                                        ++non_replayable_faults->batch_id,
+                                        false);
+    }
+
+    // Check logical permissions
+    status = uvm_va_range_check_logical_permissions(va_range,
+                                                    gpu->id,
+                                                    fault_entry->fault_access_type,
+                                                    uvm_range_group_address_migratable(va_space,
+                                                                                       fault_entry->fault_address));
+    if (status != NV_OK) {
+        // A permission failure is reported through the fault entry, not
+        // through the return value: the caller checks fault_entry->is_fatal.
+        fault_entry->is_fatal = true;
+        fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
+        return NV_OK;
+    }
+
+    // TODO: Bug 1880194: Revisit thrashing detection
+    thrashing_hint.type = UVM_PERF_THRASHING_HINT_TYPE_NONE;
+
+    service_context->read_duplicate_count = 0;
+    service_context->thrashing_pin_count = 0;
+
+    page_index = uvm_va_block_cpu_page_index(va_block, fault_entry->fault_address);
+
+    // Compute new residency and update the masks
+    new_residency = uvm_va_block_select_residency(va_block,
+                                                  page_index,
+                                                  gpu->id,
+                                                  fault_entry->access_type_mask,
+                                                  service_context->block_context.policy,
+                                                  &thrashing_hint,
+                                                  UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS,
+                                                  &read_duplicate);
+
+    // Initialize the minimum necessary state in the fault service context
+    uvm_processor_mask_zero(&service_context->resident_processors);
+
+    // Set new residency and update the masks
+    uvm_processor_mask_set(&service_context->resident_processors, new_residency);
+
+    // The masks need to be fully zeroed as the fault region may grow due to prefetching
+    uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
+    uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index);
+
+    if (read_duplicate) {
+        uvm_page_mask_zero(&service_context->read_duplicate_mask);
+        uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
+        service_context->read_duplicate_count = 1;
+    }
+
+    service_context->access_type[page_index] = fault_entry->fault_access_type;
+
+    // The region to service is the single faulting page
+    service_context->region = uvm_va_block_region_for_page(page_index);
+
+    status = uvm_va_block_service_locked(gpu->id, va_block, va_block_retry, service_context);
+
+    // Count this attempt so a retry skips the event notification above
+    ++service_context->num_retries;
+
+    return status;
+}
+
+// Services a fault on managed memory within va_block. Takes the block lock,
+// runs service_managed_fault_in_block_locked under the block retry macro, and
+// adds the block's tracker to the GPU's fault_service_tracker before
+// releasing the lock.
+//
+// Returns the servicing error if there is one, otherwise the status of the
+// tracker update.
+static NV_STATUS service_managed_fault_in_block(uvm_gpu_t *gpu,
+                                                struct mm_struct *mm,
+                                                uvm_va_block_t *va_block,
+                                                uvm_fault_buffer_entry_t *fault_entry)
+{
+    NV_STATUS service_status;
+    NV_STATUS tracker_status;
+    uvm_va_block_retry_t va_block_retry;
+    uvm_non_replayable_fault_buffer_info_t *nr_faults = &gpu->parent->fault_buffer_info.non_replayable;
+    uvm_service_block_context_t *service_context = &nr_faults->block_service_context;
+
+    // Reset the per-fault state of the service context
+    service_context->operation = UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS;
+    service_context->num_retries = 0;
+    service_context->block_context.mm = mm;
+
+    uvm_mutex_lock(&va_block->lock);
+
+    service_status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
+                                               service_managed_fault_in_block_locked(gpu,
+                                                                                     va_block,
+                                                                                     &va_block_retry,
+                                                                                     fault_entry,
+                                                                                     service_context));
+
+    // The tracker is updated regardless of the servicing outcome
+    tracker_status = uvm_tracker_add_tracker_safe(&nr_faults->fault_service_tracker, &va_block->tracker);
+
+    uvm_mutex_unlock(&va_block->lock);
+
+    // A servicing error takes precedence over a tracker error
+    if (service_status != NV_OK)
+        return service_status;
+
+    return tracker_status;
+}
+
+// Work item scheduled by schedule_kill_channel. Forwards the fault packet
+// saved in the channel to RM, then drops the channel reference taken when the
+// work item was scheduled.
+//
+// See uvm_unregister_channel for comments on the channel destruction
+// sequence.
+static void kill_channel_delayed(void *_user_channel)
+{
+    uvm_user_channel_t *channel = (uvm_user_channel_t *)_user_channel;
+    uvm_va_space_t *va_space = channel->kill_channel.va_space;
+
+    UVM_ASSERT(uvm_va_space_initialized(va_space) == NV_OK);
+
+    uvm_va_space_down_read_rm(va_space);
+
+    // Nothing is reported when the channel no longer has a GPU VA space
+    if (channel->gpu_va_space) {
+        NV_STATUS status;
+
+        // RM handles the fault, which will do the correct fault reporting in
+        // the kernel logs and will initiate channel teardown
+        status = nvUvmInterfaceReportNonReplayableFault(uvm_gpu_device_handle(channel->gpu),
+                                                        channel->kill_channel.fault_packet);
+        UVM_ASSERT(status == NV_OK);
+    }
+
+    uvm_va_space_up_read_rm(va_space);
+
+    uvm_user_channel_release(channel);
+}
+
+// Work item callback registered by schedule_kill_channel. Runs
+// kill_channel_delayed through the UVM_ENTRY_VOID wrapper; user_channel is the
+// uvm_user_channel_t pointer given to nv_kthread_q_item_init.
+static void kill_channel_delayed_entry(void *user_channel)
+{
+    UVM_ENTRY_VOID(kill_channel_delayed(user_channel));
+}
+
+// Schedules a work item that forwards the fault to RM so that the channel
+// gets killed (see kill_channel_delayed).
+//
+// The raw fault packet is copied from the shadow buffer copy into the channel
+// structure, since the shadow buffer is overwritten by later fetches. The
+// channel is retained until the work item has run. This is a no-op if a kill
+// has already been scheduled for the channel.
+static void schedule_kill_channel(uvm_gpu_t *gpu,
+                                  uvm_fault_buffer_entry_t *fault_entry,
+                                  uvm_user_channel_t *user_channel)
+{
+    uvm_va_space_t *va_space;
+    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults;
+    NvU32 entry_size;
+    void *packet;
+
+    // Validate the arguments before any of them is dereferenced
+    UVM_ASSERT(gpu);
+    UVM_ASSERT(fault_entry);
+    UVM_ASSERT(user_channel);
+
+    va_space = fault_entry->va_space;
+    UVM_ASSERT(va_space);
+
+    if (user_channel->kill_channel.scheduled)
+        return;
+
+    non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
+    entry_size = gpu->parent->fault_buffer_hal->entry_size(gpu->parent);
+
+    // Locate the raw packet this entry was parsed from
+    packet = (char *)non_replayable_faults->shadow_buffer_copy +
+             (fault_entry->non_replayable.buffer_index * entry_size);
+
+    user_channel->kill_channel.scheduled = true;
+    user_channel->kill_channel.va_space = va_space;
+
+    // Save the packet to be handled by RM in the channel structure
+    memcpy(user_channel->kill_channel.fault_packet, packet, entry_size);
+
+    // Retain the channel here so it is not prematurely destroyed. It will be
+    // released after forwarding the fault to RM in kill_channel_delayed.
+    uvm_user_channel_retain(user_channel);
+
+    // Schedule a work item to kill the channel
+    nv_kthread_q_item_init(&user_channel->kill_channel.kill_channel_q_item,
+                           kill_channel_delayed_entry,
+                           user_channel);
+
+    nv_kthread_q_schedule_q_item(&gpu->parent->isr.kill_channel_q,
+                                 &user_channel->kill_channel.kill_channel_q_item);
+}
+
+// Handles a fault for which no VA block could be found or created.
+// lookup_status is the error returned by the VA block lookup.
+//
+// - Any lookup error other than NV_ERR_INVALID_ADDRESS is returned as is.
+// - If ATS can service faults for this GPU VA space, the fault is handed to
+//   ATS and, on success, the ATS TLB invalidation is issued.
+// - Otherwise the fault entry is marked fatal and NV_OK is returned, so the
+//   caller can tell application errors from servicing errors.
+static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
+                                           struct mm_struct *mm,
+                                           uvm_fault_buffer_entry_t *fault_entry,
+                                           NV_STATUS lookup_status)
+{
+    uvm_gpu_t *gpu = gpu_va_space->gpu;
+    uvm_non_replayable_fault_buffer_info_t *nr_faults = &gpu->parent->fault_buffer_info.non_replayable;
+    uvm_ats_fault_invalidate_t *ats_invalidate = &nr_faults->ats_invalidate;
+    NV_STATUS status;
+
+    UVM_ASSERT(!fault_entry->is_fatal);
+
+    // Avoid dropping fault events when the VA block is not found or cannot be created
+    uvm_perf_event_notify_gpu_fault(&fault_entry->va_space->perf_events,
+                                    NULL,
+                                    gpu->id,
+                                    UVM_ID_INVALID,
+                                    fault_entry,
+                                    ++nr_faults->batch_id,
+                                    false);
+
+    if (lookup_status != NV_ERR_INVALID_ADDRESS)
+        return lookup_status;
+
+    if (!uvm_ats_can_service_faults(gpu_va_space, mm)) {
+        UVM_ASSERT(fault_entry->fault_access_type != UVM_FAULT_ACCESS_TYPE_PREFETCH);
+        fault_entry->is_fatal = true;
+        fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(lookup_status);
+
+        // Do not return error due to logical errors in the application
+        return NV_OK;
+    }
+
+    ats_invalidate->write_faults_in_batch = false;
+
+    // The VA isn't managed. See if ATS knows about it.
+    status = uvm_ats_service_fault_entry(gpu_va_space, fault_entry, ats_invalidate);
+    if (status != NV_OK)
+        return status;
+
+    // Invalidate ATS TLB entries if needed
+    return uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &nr_faults->fault_service_tracker);
+}
+
+// Services one parsed non-replayable fault entry.
+//
+// The function looks up the VA space, GPU VA space and user channel that the
+// fault belongs to. If any of them cannot be found, the entry is ignored and
+// NV_OK is returned. Otherwise:
+// - A non-fatal fault is serviced as a managed fault if a VA block can be
+//   found or created for its address, and handed to
+//   service_non_managed_fault if not. If servicing succeeds and the fault is
+//   still non-fatal, the channel's faulted bit is cleared.
+// - A fatal fault (flagged at parse time or during servicing) is recorded
+//   through uvm_tools_record_gpu_fatal_fault.
+// - On a servicing error or a fatal fault, a channel kill is scheduled.
+//
+// Returns the status of the servicing/clear-faulted sequence.
+static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_entry)
+{
+    NV_STATUS status;
+    uvm_user_channel_t *user_channel;
+    uvm_va_block_t *va_block;
+    uvm_va_space_t *va_space = NULL;
+    struct mm_struct *mm;
+    uvm_gpu_va_space_t *gpu_va_space;
+    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
+    uvm_va_block_context_t *va_block_context =
+        &gpu->parent->fault_buffer_info.non_replayable.block_service_context.block_context;
+
+    status = uvm_gpu_fault_entry_to_va_space(gpu, fault_entry, &va_space);
+    if (status != NV_OK) {
+        // The VA space lookup will fail if we're running concurrently with
+        // removal of the channel from the VA space (channel unregister, GPU VA
+        // space unregister, VA space destroy, etc). The other thread will stop
+        // the channel and remove the channel from the table, so the faulting
+        // condition will be gone. In the case of replayable faults we need to
+        // flush the buffer, but here we can just ignore the entry and proceed
+        // on.
+        //
+        // Note that we can't have any subcontext issues here, since non-
+        // replayable faults only use the address space of their channel.
+        UVM_ASSERT(status == NV_ERR_INVALID_CHANNEL);
+        UVM_ASSERT(!va_space);
+        return NV_OK;
+    }
+
+    UVM_ASSERT(va_space);
+
+    // If an mm is registered with the VA space, we have to retain it
+    // in order to lock it before locking the VA space. It is guaranteed
+    // to remain valid until we release. If no mm is registered, we
+    // can only service managed faults, not ATS/HMM faults.
+    mm = uvm_va_space_mm_retain_lock(va_space);
+
+    uvm_va_space_down_read(va_space);
+
+    gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
+
+    if (!gpu_va_space) {
+        // The va_space might have gone away. See the comment above.
+        status = NV_OK;
+        goto exit_no_channel;
+    }
+
+    fault_entry->va_space = va_space;
+
+    user_channel = uvm_gpu_va_space_get_user_channel(gpu_va_space, fault_entry->instance_ptr);
+    if (!user_channel) {
+        // The channel might have gone away. See the comment above.
+        status = NV_OK;
+        goto exit_no_channel;
+    }
+
+    fault_entry->fault_source.channel_id = user_channel->hw_channel_id;
+
+    // Entries flagged as fatal at parse time skip servicing altogether.
+    // status is still NV_OK from the VA space lookup in that case.
+    if (!fault_entry->is_fatal) {
+        status = uvm_va_block_find_create(fault_entry->va_space,
+                                          mm,
+                                          fault_entry->fault_address,
+                                          va_block_context,
+                                          &va_block);
+        if (status == NV_OK)
+            status = service_managed_fault_in_block(gpu_va_space->gpu, mm, va_block, fault_entry);
+        else
+            status = service_non_managed_fault(gpu_va_space, mm, fault_entry, status);
+
+        // We are done, we clear the faulted bit on the channel, so it can be
+        // re-scheduled again
+        if (status == NV_OK && !fault_entry->is_fatal) {
+            status = clear_faulted_on_gpu(gpu,
+                                          user_channel,
+                                          fault_entry,
+                                          non_replayable_faults->batch_id,
+                                          &non_replayable_faults->fault_service_tracker);
+            uvm_tracker_clear(&non_replayable_faults->fault_service_tracker);
+        }
+    }
+
+    // Servicing may have turned a non-fatal fault into a fatal one, so
+    // is_fatal is checked again here.
+    if (fault_entry->is_fatal)
+        uvm_tools_record_gpu_fatal_fault(gpu->parent->id, fault_entry->va_space, fault_entry, fault_entry->fatal_reason);
+
+    if (status != NV_OK || fault_entry->is_fatal)
+        schedule_kill_channel(gpu, fault_entry, user_channel);
+
+exit_no_channel:
+    // Release in the reverse order of acquisition: VA space lock, then mm
+    uvm_va_space_up_read(va_space);
+    uvm_va_space_mm_release_unlock(va_space, mm);
+
+    return status;
+}
+
+// Services the non-replayable faults pending on the given GPU. Faults are
+// fetched from RM in batches and serviced one by one until no more faults are
+// reported.
+//
+// Servicing stops at the first error, which is logged. No further batches are
+// fetched after an error, so the failure status cannot be overwritten by a
+// later successful fault.
+void uvm_gpu_service_non_replayable_fault_buffer(uvm_gpu_t *gpu)
+{
+    NV_STATUS status = NV_OK;
+    NvU32 cached_faults;
+
+    // If this handler is modified to handle fewer than all of the outstanding
+    // faults, then special handling will need to be added to uvm_suspend()
+    // to guarantee that fault processing has completed before control is
+    // returned to the RM.
+    //
+    // status is checked first so that nothing is fetched after a failure.
+    while (status == NV_OK && (cached_faults = fetch_non_replayable_fault_buffer_entries(gpu)) > 0) {
+        NvU32 i;
+
+        // Unlike replayable faults, we do not batch up and preprocess
+        // non-replayable faults since getting multiple faults on the same
+        // memory region is not very likely
+        for (i = 0; i < cached_faults; ++i) {
+            status = service_fault(gpu, &gpu->parent->fault_buffer_info.non_replayable.fault_cache[i]);
+            if (status != NV_OK)
+                break;
+        }
+    }
+
+    if (status != NV_OK)
+        UVM_DBG_PRINT("Error servicing non-replayable faults on GPU: %s\n", uvm_gpu_name(gpu));
+}