515.43.04

2026-02-03 06:57:27 +00:00 · 2022-05-09 13:18:59 -07:00
commit 1739a20efc
2519 changed files with 1060036 additions and 0 deletions
--- a/kernel-open/nvidia-uvm/uvm_pushbuffer.c
+++ b/kernel-open/nvidia-uvm/uvm_pushbuffer.c
@@ -0,0 +1,488 @@
+/*******************************************************************************
+    Copyright (c) 2015-2019 NVIDIA Corporation
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to
+    deal in the Software without restriction, including without limitation the
+    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+    sell copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+        The above copyright notice and this permission notice shall be
+        included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+
+*******************************************************************************/
+
+#include "uvm_api.h"
+#include "uvm_pushbuffer.h"
+#include "uvm_channel.h"
+#include "uvm_global.h"
+#include "uvm_lock.h"
+#include "uvm_procfs.h"
+#include "uvm_push.h"
+#include "uvm_kvmalloc.h"
+#include "uvm_gpu.h"
+#include "uvm_common.h"
+#include "uvm_linux.h"
+
+// Print pushbuffer state into a seq_file if provided or with UVM_DBG_PRINT() if not.
+static void uvm_pushbuffer_print_common(uvm_pushbuffer_t *pushbuffer, struct seq_file *s);
+
+// seq_file show routine: prints the state of the pushbuffer found in
+// s->private into s.
+//
+// The dump only happens if the global PM lock can be taken for read with a
+// trylock; otherwise nothing is printed and -EAGAIN is returned. Returns 0
+// after a successful dump. v is unused.
+static int nv_procfs_read_pushbuffer_info(struct seq_file *s, void *v)
+{
+    uvm_pushbuffer_t *pushbuffer = s->private;
+
+    if (uvm_down_read_trylock(&g_uvm_global.pm.lock)) {
+        uvm_pushbuffer_print_common(pushbuffer, s);
+        uvm_up_read(&g_uvm_global.pm.lock);
+        return 0;
+    }
+
+    return -EAGAIN;
+}
+
+// Wrapper around nv_procfs_read_pushbuffer_info() that routes the call through
+// UVM_ENTRY_RET().
+// NOTE(review): UVM_ENTRY_RET() is defined outside this file; presumably it
+// performs the driver's common entry-point handling and returns the wrapped
+// value - confirm against its definition.
+static int nv_procfs_read_pushbuffer_info_entry(struct seq_file *s, void *v)
+{
+    UVM_ENTRY_RET(nv_procfs_read_pushbuffer_info(s, v));
+}
+
+// NOTE(review): presumably generates the procfs file operations that
+// create_procfs() passes to NV_CREATE_PROC_FILE() as pushbuffer_info_entry -
+// confirm against the macro definition.
+UVM_DEFINE_SINGLE_PROCFS_FILE(pushbuffer_info_entry);
+
+// Creates the "pushbuffer" procfs file under the owning GPU's procfs directory,
+// passing the pushbuffer itself as the file's data argument.
+//
+// The file is debug-only: when procfs debug support is disabled nothing is
+// created and NV_OK is returned. Returns NV_ERR_OPERATING_SYSTEM if the file
+// cannot be created, NV_OK otherwise.
+static NV_STATUS create_procfs(uvm_pushbuffer_t *pushbuffer)
+{
+    uvm_gpu_t *gpu = pushbuffer->channel_manager->gpu;
+
+    if (uvm_procfs_is_debug_enabled()) {
+        pushbuffer->procfs.info_file = NV_CREATE_PROC_FILE("pushbuffer",
+                                                           gpu->procfs.dir,
+                                                           pushbuffer_info_entry,
+                                                           pushbuffer);
+        if (pushbuffer->procfs.info_file == NULL)
+            return NV_ERR_OPERATING_SYSTEM;
+    }
+
+    return NV_OK;
+}
+
+// Allocates and initializes a pushbuffer for the given channel manager.
+//
+// The backing memory is UVM_PUSHBUFFER_SIZE bytes of CPU-mapped RM memory of
+// type UVM_RM_MEM_TYPE_SYS or UVM_RM_MEM_TYPE_GPU, selected by
+// channel_manager->conf.pushbuffer_loc. All chunks start out both idle and
+// available, with empty pending GPFIFO lists.
+//
+// On success the new pushbuffer is stored in *pushbuffer_out and NV_OK is
+// returned. If the object itself cannot be allocated, NV_ERR_NO_MEMORY is
+// returned. On any later failure the partially built pushbuffer is released
+// through uvm_pushbuffer_destroy(), *pushbuffer_out is left untouched and the
+// failing status is returned.
+NV_STATUS uvm_pushbuffer_create(uvm_channel_manager_t *channel_manager, uvm_pushbuffer_t **pushbuffer_out)
+{
+    uvm_gpu_t *gpu = channel_manager->gpu;
+    uvm_pushbuffer_t *pushbuffer;
+    NV_STATUS status;
+    int i;
+
+    pushbuffer = uvm_kvmalloc_zero(sizeof(*pushbuffer));
+    if (!pushbuffer)
+        return NV_ERR_NO_MEMORY;
+
+    pushbuffer->channel_manager = channel_manager;
+
+    uvm_spin_lock_init(&pushbuffer->lock, UVM_LOCK_ORDER_LEAF);
+
+    // The semaphore caps the number of concurrent pushes at one per chunk.
+    uvm_sema_init(&pushbuffer->concurrent_pushes_sema, UVM_PUSHBUFFER_CHUNKS, UVM_LOCK_ORDER_PUSH);
+
+    UVM_ASSERT(channel_manager->conf.pushbuffer_loc == UVM_BUFFER_LOCATION_SYS ||
+               channel_manager->conf.pushbuffer_loc == UVM_BUFFER_LOCATION_VID);
+
+    status = uvm_rm_mem_alloc_and_map_cpu(
+        gpu,
+        (channel_manager->conf.pushbuffer_loc == UVM_BUFFER_LOCATION_SYS) ? UVM_RM_MEM_TYPE_SYS : UVM_RM_MEM_TYPE_GPU,
+        UVM_PUSHBUFFER_SIZE,
+        &pushbuffer->memory);
+    if (status != NV_OK)
+        goto fail;
+
+    // Nothing has been pushed yet: every chunk is idle and can take a push.
+    bitmap_fill(pushbuffer->idle_chunks, UVM_PUSHBUFFER_CHUNKS);
+    bitmap_fill(pushbuffer->available_chunks, UVM_PUSHBUFFER_CHUNKS);
+
+    for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; i++)
+        INIT_LIST_HEAD(&pushbuffer->chunks[i].pending_gpfifos);
+
+    status = create_procfs(pushbuffer);
+    if (status != NV_OK)
+        goto fail;
+
+    *pushbuffer_out = pushbuffer;
+    return NV_OK;
+
+fail:
+    uvm_pushbuffer_destroy(pushbuffer);
+    return status;
+}
+
+// Returns the chunk matching the first set bit of mask, or NULL when none of
+// the UVM_PUSHBUFFER_CHUNKS bits is set. The pushbuffer lock must be held.
+static uvm_pushbuffer_chunk_t *get_chunk_in_mask(uvm_pushbuffer_t *pushbuffer, unsigned long *mask)
+{
+    NvU32 first_set = find_first_bit(mask, UVM_PUSHBUFFER_CHUNKS);
+
+    uvm_assert_spinlock_locked(&pushbuffer->lock);
+
+    return (first_set == UVM_PUSHBUFFER_CHUNKS) ? NULL : &pushbuffer->chunks[first_set];
+}
+
+// Returns the first chunk flagged in the available_chunks mask, or NULL if no
+// chunk is flagged. The pushbuffer lock must be held (asserted by
+// get_chunk_in_mask()).
+static uvm_pushbuffer_chunk_t *get_available_chunk(uvm_pushbuffer_t *pushbuffer)
+{
+    unsigned long *available_mask = pushbuffer->available_chunks;
+
+    return get_chunk_in_mask(pushbuffer, available_mask);
+}
+
+// Returns the first chunk flagged in the idle_chunks mask, or NULL if no chunk
+// is flagged. The pushbuffer lock must be held (asserted by
+// get_chunk_in_mask()).
+static uvm_pushbuffer_chunk_t *get_idle_chunk(uvm_pushbuffer_t *pushbuffer)
+{
+    unsigned long *idle_mask = pushbuffer->idle_chunks;
+
+    return get_chunk_in_mask(pushbuffer, idle_mask);
+}
+
+// Returns the position of chunk within pushbuffer->chunks. Asserts that the
+// result is a valid chunk index.
+static NvU32 chunk_get_index(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk)
+{
+    NvU32 index = chunk - &pushbuffer->chunks[0];
+
+    UVM_ASSERT(index < UVM_PUSHBUFFER_CHUNKS);
+
+    return index;
+}
+
+// Returns the byte offset of the start of chunk from the start of the
+// pushbuffer, i.e. its index times UVM_PUSHBUFFER_CHUNK_SIZE.
+static NvU32 chunk_get_offset(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk)
+{
+    NvU32 chunk_index = chunk_get_index(pushbuffer, chunk);
+
+    return chunk_index * UVM_PUSHBUFFER_CHUNK_SIZE;
+}
+
+// Sets the bit belonging to chunk in mask. The non-atomic __set_bit() is used,
+// so the pushbuffer lock must be held.
+static void set_chunk(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk, unsigned long *mask)
+{
+    NvU32 bit = chunk_get_index(pushbuffer, chunk);
+
+    uvm_assert_spinlock_locked(&pushbuffer->lock);
+    __set_bit(bit, mask);
+}
+
+// Clears the bit belonging to chunk in mask. The non-atomic __clear_bit() is
+// used, so the pushbuffer lock must be held.
+static void clear_chunk(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk, unsigned long *mask)
+{
+    NvU32 bit = chunk_get_index(pushbuffer, chunk);
+
+    uvm_assert_spinlock_locked(&pushbuffer->lock);
+    __clear_bit(bit, mask);
+}
+
+// Selects a chunk that can take a new push: an idle chunk if there is one,
+// otherwise an available one. Returns NULL when neither exists. The masks are
+// not modified. The pushbuffer lock must be held.
+static uvm_pushbuffer_chunk_t *pick_chunk(uvm_pushbuffer_t *pushbuffer)
+{
+    uvm_pushbuffer_chunk_t *idle = get_idle_chunk(pushbuffer);
+
+    uvm_assert_spinlock_locked(&pushbuffer->lock);
+
+    if (idle != NULL)
+        return idle;
+
+    return get_available_chunk(pushbuffer);
+}
+
+// Makes a single attempt at claiming a chunk for push.
+//
+// On success the chunk records push as its current push, is removed from both
+// the idle and the available masks, is stored in *chunk_out and true is
+// returned. When no chunk can be picked, *chunk_out is set to NULL and false is
+// returned. Takes and releases the pushbuffer lock internally.
+static bool try_claim_chunk(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push, uvm_pushbuffer_chunk_t **chunk_out)
+{
+    uvm_pushbuffer_chunk_t *claimed;
+
+    uvm_spin_lock(&pushbuffer->lock);
+
+    claimed = pick_chunk(pushbuffer);
+    if (claimed != NULL) {
+        claimed->current_push = push;
+        clear_chunk(pushbuffer, claimed, pushbuffer->idle_chunks);
+        clear_chunk(pushbuffer, claimed, pushbuffer->available_chunks);
+    }
+
+    uvm_spin_unlock(&pushbuffer->lock);
+
+    *chunk_out = claimed;
+    return claimed != NULL;
+}
+
+// Returns the CPU address at which the next push in chunk begins: the CPU
+// mapping of the pushbuffer memory, plus the chunk's offset, plus the chunk's
+// next_push_start. The address is asserted to be NvU32-aligned.
+static NvU32 *chunk_get_next_push_start_addr(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk)
+{
+    char *cpu_base = (char *)uvm_rm_mem_get_cpu_va(pushbuffer->memory);
+    char *push_start = cpu_base + chunk_get_offset(pushbuffer, chunk) + chunk->next_push_start;
+
+    UVM_ASSERT(((NvU64)push_start) % sizeof(NvU32) == 0);
+
+    return (NvU32 *)push_start;
+}
+
+static NV_STATUS claim_chunk(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push, uvm_pushbuffer_chunk_t **chunk_out)
+{
+    NV_STATUS status = NV_OK;
+    uvm_channel_manager_t *channel_manager = pushbuffer->channel_manager;
+    uvm_spin_loop_t spin;
+
+    if (try_claim_chunk(pushbuffer, push, chunk_out))
+        return NV_OK;
+
+    uvm_channel_manager_update_progress(channel_manager);
+
+    uvm_spin_loop_init(&spin);
+    while (!try_claim_chunk(pushbuffer, push, chunk_out) && status == NV_OK) {
+        UVM_SPIN_LOOP(&spin);
+        status = uvm_channel_manager_check_errors(channel_manager);
+        uvm_channel_manager_update_progress(channel_manager);
+    }
+
+    return status;
+}
+
+// Reserves pushbuffer space for push and points push->begin and push->next at
+// the start of that space.
+//
+// A slot of the concurrent pushes semaphore is taken first; on success it stays
+// taken until uvm_pushbuffer_end_push() releases it. If no chunk can be claimed
+// the slot is released again and the error from claim_chunk() is returned.
+NV_STATUS uvm_pushbuffer_begin_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push)
+{
+    NV_STATUS status;
+    uvm_pushbuffer_chunk_t *chunk;
+
+    UVM_ASSERT(pushbuffer);
+    UVM_ASSERT(push);
+
+    uvm_down(&pushbuffer->concurrent_pushes_sema);
+
+    status = claim_chunk(pushbuffer, push, &chunk);
+    if (status == NV_OK) {
+        UVM_ASSERT(chunk);
+
+        push->begin = chunk_get_next_push_start_addr(pushbuffer, chunk);
+        push->next = push->begin;
+
+        return NV_OK;
+    }
+
+    // No push will follow, so the matching end_push() will never run.
+    uvm_up(&pushbuffer->concurrent_pushes_sema);
+
+    return status;
+}
+
+// Returns the entry at the head of the chunk's pending GPFIFO list, or NULL if
+// the list is empty.
+static uvm_gpfifo_entry_t *chunk_get_first_gpfifo(uvm_pushbuffer_chunk_t *chunk)
+{
+    struct list_head *pending = &chunk->pending_gpfifos;
+
+    return list_first_entry_or_null(pending, uvm_gpfifo_entry_t, pending_list_node);
+}
+
+// Returns the entry at the tail of the chunk's pending GPFIFO list, or NULL if
+// the list is empty.
+static uvm_gpfifo_entry_t *chunk_get_last_gpfifo(uvm_pushbuffer_chunk_t *chunk)
+{
+    struct list_head *pending = &chunk->pending_gpfifos;
+
+    return list_last_entry_or_null(pending, uvm_gpfifo_entry_t, pending_list_node);
+}
+
+// Get the cpu put within the chunk (in range [0, UVM_PUSHBUFFER_CHUNK_SIZE]).
+//
+// The put is the chunk-relative end of the last pending GPFIFO entry, or 0 if
+// nothing is pending. The pushbuffer lock must be held.
+static NvU32 chunk_get_cpu_put(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk)
+{
+    uvm_gpfifo_entry_t *last = chunk_get_last_gpfifo(chunk);
+
+    uvm_assert_spinlock_locked(&pushbuffer->lock);
+
+    if (last == NULL)
+        return 0;
+
+    return last->pushbuffer_offset + last->pushbuffer_size - chunk_get_offset(pushbuffer, chunk);
+}
+
+// Get the gpu get within the chunk (in range [0, UVM_PUSHBUFFER_CHUNK_SIZE)).
+//
+// The get is the chunk-relative start of the first pending GPFIFO entry, or 0
+// if nothing is pending. The pushbuffer lock must be held.
+static NvU32 chunk_get_gpu_get(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk)
+{
+    uvm_gpfifo_entry_t *first = chunk_get_first_gpfifo(chunk);
+
+    uvm_assert_spinlock_locked(&pushbuffer->lock);
+
+    if (first == NULL)
+        return 0;
+
+    return first->pushbuffer_offset - chunk_get_offset(pushbuffer, chunk);
+}
+
+// Recomputes whether chunk can take another push and, if so, where that push
+// starts (chunk->next_push_start), based on the current get and put positions.
+//
+// This function only ever sets bits in the idle/available masks; it never
+// clears them. The pushbuffer lock must be held.
+static void update_chunk(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk)
+{
+    NvU32 gpu_get = chunk_get_gpu_get(pushbuffer, chunk);
+    NvU32 cpu_put = chunk_get_cpu_put(pushbuffer, chunk);
+
+    uvm_assert_spinlock_locked(&pushbuffer->lock);
+
+    if (gpu_get == cpu_put) {
+        // Equal get and put means either a full or an empty chunk. A non-empty
+        // pending GPFIFO list identifies the full case, in which nothing
+        // changes.
+        if (!list_empty(&chunk->pending_gpfifos))
+            return;
+
+        // Nothing pending: the chunk is completely idle.
+        set_chunk(pushbuffer, chunk, pushbuffer->idle_chunks);
+        set_chunk(pushbuffer, chunk, pushbuffer->available_chunks);
+        UVM_ASSERT_MSG(cpu_put == 0, "cpu put %u\n", cpu_put);
+
+        // An idle chunk always restarts from its very beginning, which helps
+        // avoid the waste that can happen at the very end of the chunk
+        // described at the top of uvm_pushbuffer.h.
+        chunk->next_push_start = 0;
+        return;
+    }
+
+    if (gpu_get > cpu_put) {
+        // The free space is the gap between put and get; it is usable only if
+        // a maximum-size push fits in it.
+        if (gpu_get - cpu_put >= UVM_MAX_PUSH_SIZE) {
+            set_chunk(pushbuffer, chunk, pushbuffer->available_chunks);
+            chunk->next_push_start = cpu_put;
+        }
+        return;
+    }
+
+    // From here on get is below put: the free space is split between the tail
+    // of the chunk (after put) and its head (before get). The tail is
+    // preferred.
+    if (UVM_PUSHBUFFER_CHUNK_SIZE >= cpu_put + UVM_MAX_PUSH_SIZE) {
+        UVM_ASSERT_MSG(gpu_get < cpu_put, "gpu_get %u cpu_put %u\n", gpu_get, cpu_put);
+
+        // A maximum-size push fits at the end
+        set_chunk(pushbuffer, chunk, pushbuffer->available_chunks);
+        chunk->next_push_start = cpu_put;
+        return;
+    }
+
+    if (gpu_get >= UVM_MAX_PUSH_SIZE) {
+        UVM_ASSERT_MSG(gpu_get < cpu_put, "gpu_get %u cpu_put %u\n", gpu_get, cpu_put);
+
+        // A maximum-size push fits at the beginning
+        set_chunk(pushbuffer, chunk, pushbuffer->available_chunks);
+        chunk->next_push_start = 0;
+    }
+}
+
+// Tears down a pushbuffer: removes its procfs entry, frees its RM memory and
+// frees the object itself. Passing NULL is a no-op.
+//
+// uvm_pushbuffer_create() also calls this on partially constructed pushbuffers
+// whose procfs entry and/or memory were never set up.
+void uvm_pushbuffer_destroy(uvm_pushbuffer_t *pushbuffer)
+{
+    if (!pushbuffer)
+        return;
+
+    uvm_procfs_destroy_entry(pushbuffer->procfs.info_file);
+    uvm_rm_mem_free(pushbuffer->memory);
+
+    uvm_kvfree(pushbuffer);
+}
+
+// Returns the chunk that contains the given byte offset into the pushbuffer.
+// The offset is asserted to lie within the pushbuffer.
+static uvm_pushbuffer_chunk_t *offset_to_chunk(uvm_pushbuffer_t *pushbuffer, NvU32 offset)
+{
+    NvU32 chunk_index;
+
+    UVM_ASSERT(offset < UVM_PUSHBUFFER_SIZE);
+
+    chunk_index = offset / UVM_PUSHBUFFER_CHUNK_SIZE;
+
+    return &pushbuffer->chunks[chunk_index];
+}
+
+// Returns the chunk holding the pushbuffer range of the given GPFIFO entry.
+// The chunk is looked up from the first byte of the range; the last byte is
+// asserted to fall in the same chunk.
+static uvm_pushbuffer_chunk_t *gpfifo_to_chunk(uvm_pushbuffer_t *pushbuffer, uvm_gpfifo_entry_t *gpfifo)
+{
+    uvm_pushbuffer_chunk_t *chunk;
+
+    chunk = offset_to_chunk(pushbuffer, gpfifo->pushbuffer_offset);
+
+    UVM_ASSERT(offset_to_chunk(pushbuffer, gpfifo->pushbuffer_offset + gpfifo->pushbuffer_size - 1) == chunk);
+
+    return chunk;
+}
+
+// Retires a GPFIFO entry from the pending list of its chunk.
+//
+// The entry's on_complete callback, if any, is invoked first (without the
+// pushbuffer lock held) and then cleared together with its data. The entry is
+// then unlinked under the lock. The chunk state is recomputed only when the
+// entry was at either end of the pending list, as only those entries are used
+// to derive the chunk's get and put.
+void uvm_pushbuffer_mark_completed(uvm_pushbuffer_t *pushbuffer, uvm_gpfifo_entry_t *gpfifo)
+{
+    uvm_pushbuffer_chunk_t *chunk = gpfifo_to_chunk(pushbuffer, gpfifo);
+    uvm_push_info_t *push_info = gpfifo->push_info;
+    bool was_first_or_last;
+
+    if (push_info->on_complete)
+        push_info->on_complete(push_info->on_complete_data);
+
+    push_info->on_complete = NULL;
+    push_info->on_complete_data = NULL;
+
+    uvm_spin_lock(&pushbuffer->lock);
+
+    // Must be evaluated before the entry is removed from the list.
+    was_first_or_last = (gpfifo == chunk_get_first_gpfifo(chunk)) || (gpfifo == chunk_get_last_gpfifo(chunk));
+
+    list_del(&gpfifo->pending_list_node);
+
+    // While a push is in flight on the chunk (current_push != NULL), the
+    // update is left to uvm_pushbuffer_end_push() of that push.
+    if (was_first_or_last && chunk->current_push == NULL)
+        update_chunk(pushbuffer, chunk);
+
+    uvm_spin_unlock(&pushbuffer->lock);
+}
+
+// Returns the byte offset of push->begin from the start of the pushbuffer's
+// CPU mapping. The offset is asserted to be a multiple of sizeof(NvU32).
+NvU32 uvm_pushbuffer_get_offset_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push)
+{
+    char *cpu_base = (char *)uvm_rm_mem_get_cpu_va(pushbuffer->memory);
+    NvU32 offset = (char *)push->begin - cpu_base;
+
+    UVM_ASSERT(((NvU64)offset) % sizeof(NvU32) == 0);
+
+    return offset;
+}
+
+// Returns the GPU virtual address of the start of push: the GPU VA of the
+// pushbuffer memory, as seen by the push's GPU and channel kind (proxy or not),
+// plus the offset of the push within the pushbuffer.
+NvU64 uvm_pushbuffer_get_gpu_va_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push)
+{
+    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
+    bool is_proxy_channel = uvm_channel_is_proxy(push->channel);
+    NvU64 gpu_va = uvm_rm_mem_get_gpu_va(pushbuffer->memory, gpu, is_proxy_channel);
+
+    gpu_va += uvm_pushbuffer_get_offset_for_push(pushbuffer, push);
+
+    return gpu_va;
+}
+
+// Finishes a push started with uvm_pushbuffer_begin_push().
+//
+// The GPFIFO entry describing the push is appended to the pending list of its
+// chunk, the chunk state is recomputed, the chunk stops tracking push as its
+// current push, and the concurrent pushes semaphore slot taken in begin_push()
+// is released.
+//
+// The lock of the push's channel pool must be held by the caller.
+void uvm_pushbuffer_end_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push, uvm_gpfifo_entry_t *gpfifo)
+{
+    uvm_pushbuffer_chunk_t *chunk;
+
+    chunk = gpfifo_to_chunk(pushbuffer, gpfifo);
+
+    uvm_assert_spinlock_locked(&push->channel->pool->lock);
+
+    uvm_spin_lock(&pushbuffer->lock);
+
+    // The new entry becomes the last pending one, so it defines the chunk's
+    // put from now on; refresh the chunk state accordingly.
+    list_add_tail(&gpfifo->pending_list_node, &chunk->pending_gpfifos);
+    update_chunk(pushbuffer, chunk);
+
+    UVM_ASSERT(chunk->current_push == push);
+    chunk->current_push = NULL;
+
+    uvm_spin_unlock(&pushbuffer->lock);
+
+    // This function runs with the channel lock held, while the concurrent
+    // pushes semaphore has a higher lock order. Rather than restructuring the
+    // callers, the semaphore is simply released out of order.
+    uvm_up_out_of_order(&pushbuffer->concurrent_pushes_sema);
+}
+
+// Returns true if some chunk could currently be picked for a new push. Nothing
+// is claimed, so the answer is only a snapshot taken under the pushbuffer
+// lock, which this function acquires and releases itself.
+bool uvm_pushbuffer_has_space(uvm_pushbuffer_t *pushbuffer)
+{
+    uvm_pushbuffer_chunk_t *candidate;
+
+    uvm_spin_lock(&pushbuffer->lock);
+    candidate = pick_chunk(pushbuffer);
+    uvm_spin_unlock(&pushbuffer->lock);
+
+    return candidate != NULL;
+}
+
+// Print pushbuffer state into a seq_file if provided or with UVM_DBG_PRINT() if
+// not.
+//
+// The output consists of the GPU name, whether the pushbuffer currently has
+// space, and one line per chunk with its put, get, next push start and its
+// available/idle bits.
+//
+// Explicitly static to match the forward declaration at the top of the file:
+// the function already had internal linkage, the definition now says so.
+static void uvm_pushbuffer_print_common(uvm_pushbuffer_t *pushbuffer, struct seq_file *s)
+{
+    NvU32 i;
+
+    UVM_SEQ_OR_DBG_PRINT(s, "Pushbuffer for GPU %s\n", uvm_gpu_name(pushbuffer->channel_manager->gpu));
+
+    // uvm_pushbuffer_has_space() takes the pushbuffer lock itself, so it has
+    // to be called before the lock is acquired for the per-chunk dump below.
+    UVM_SEQ_OR_DBG_PRINT(s, " has space: %d\n", uvm_pushbuffer_has_space(pushbuffer));
+
+    uvm_spin_lock(&pushbuffer->lock);
+
+    for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; ++i) {
+        uvm_pushbuffer_chunk_t *chunk = &pushbuffer->chunks[i];
+        NvU32 cpu_put = chunk_get_cpu_put(pushbuffer, chunk);
+        NvU32 gpu_get = chunk_get_gpu_get(pushbuffer, chunk);
+        UVM_SEQ_OR_DBG_PRINT(s, " chunk %u put %u get %u next %u available %d idle %d\n",
+                i,
+                cpu_put, gpu_get, chunk->next_push_start,
+                test_bit(i, pushbuffer->available_chunks) ? 1 : 0,
+                test_bit(i, pushbuffer->idle_chunks) ? 1 : 0);
+    }
+
+    uvm_spin_unlock(&pushbuffer->lock);
+}
+
+// Prints the pushbuffer state with UVM_DBG_PRINT() by calling the common print
+// routine without a seq_file.
+void uvm_pushbuffer_print(uvm_pushbuffer_t *pushbuffer)
+{
+    // Plain call: ISO C does not allow a return statement with an expression
+    // in a function returning void, even when the expression has type void.
+    uvm_pushbuffer_print_common(pushbuffer, NULL);
+}