580.82.07

Maneet Singh
2025-09-02 10:35:52 -07:00
parent 288f16e614
commit 6387af3092
67 changed files with 1665 additions and 838 deletions

View File

@@ -143,8 +143,13 @@ void uvm_hal_blackwell_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
// by UVM.
if (parent_gpu->rm_info.gpuArch == NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GB100 &&
parent_gpu->rm_info.gpuImplementation ==
NV2080_CTRL_MC_ARCH_INFO_IMPLEMENTATION_GB10B)
NV2080_CTRL_MC_ARCH_INFO_IMPLEMENTATION_GB10B) {
parent_gpu->is_integrated_gpu = true;
// GB10B has sticky L2 coherent cache lines.
// For details, refer to the comments in uvm_gpu.h
// where this field is declared.
parent_gpu->sticky_l2_coherent_cache_lines = true;
}
if (parent_gpu->rm_info.gpuArch == NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GB200 &&
parent_gpu->rm_info.gpuImplementation ==
NV2080_CTRL_MC_ARCH_INFO_IMPLEMENTATION_GB20B)

View File

@@ -1132,6 +1132,15 @@ struct uvm_parent_gpu_struct
// by UVM.
bool is_integrated_gpu;
// True if the GPU has sticky L2 coherent cache lines that prevent
// caching of system memory. Bug 4577236 outlines the issue: normal
// eviction of coherent cache lines is prevented, so "sticky" lines
// persist until an invalidate/snoop. This limits L2 cache availability
// and can cause cross-context interference. The issue is fixed in
// GB20B/GB20C. This field is set only for the GPU implementations that
// have this limitation, i.e. GB10B.
bool sticky_l2_coherent_cache_lines;
struct
{
// If true, the granularity of key rotation is a single channel. If
@@ -1560,6 +1569,14 @@ static NvU64 uvm_gpu_retained_count(uvm_gpu_t *gpu)
void uvm_parent_gpu_kref_put(uvm_parent_gpu_t *gpu);
// Returns a GPU peer pair index in the range [0 .. UVM_MAX_UNIQUE_GPU_PAIRS).
static bool uvm_parent_gpu_supports_full_coherence(uvm_parent_gpu_t *parent_gpu)
{
// TODO: Bug 5310178: Replace this with the value returned by RM to check
// if the GPU supports full coherence.
return parent_gpu->is_integrated_gpu;
}
NvU32 uvm_gpu_pair_index(const uvm_gpu_id_t id0, const uvm_gpu_id_t id1);
// Either retains an existing PCIe peer entry or creates a new one. In both
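
Illustrative sketch (not part of the commit): how the new uvm_parent_gpu_supports_full_coherence() helper and the sticky_l2_coherent_cache_lines flag combine to decide whether system memory may be cached on the GPU. It mirrors the gpu_should_cache_sysmem() logic added to uvm_va_block.c later in this change; the struct below is a simplified stand-in, not the driver's type.

#include <stdbool.h>

// Simplified stand-in for uvm_parent_gpu_t, for illustration only.
typedef struct
{
    bool is_integrated_gpu;
    bool sticky_l2_coherent_cache_lines;
} parent_gpu_example_t;

// Mirrors uvm_parent_gpu_supports_full_coherence(): integrated GPUs are
// treated as fully coherent until RM reports this directly (Bug 5310178).
static bool example_supports_full_coherence(const parent_gpu_example_t *parent_gpu)
{
    return parent_gpu->is_integrated_gpu;
}

// Cache system memory only when the GPU is fully coherent and does not
// suffer from sticky L2 coherent cache lines (e.g. GB10B).
static bool example_should_cache_sysmem(const parent_gpu_example_t *parent_gpu)
{
    return example_supports_full_coherence(parent_gpu) &&
           !parent_gpu->sticky_l2_coherent_cache_lines;
}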

File diff suppressed because it is too large

View File

@@ -330,6 +330,9 @@ static struct page *uvm_migrate_vma_alloc_page(migrate_vma_state_t *state)
}
}
if (!dst_page)
state->out_of_memory = true;
return dst_page;
}
@@ -489,7 +492,11 @@ static NV_STATUS uvm_migrate_vma_populate_anon_pages(struct vm_area_struct *vma,
// Pre-allocate the dst pages and mark the ones that failed
for_each_set_bit(i, page_mask, state->num_pages) {
struct page *dst_page = uvm_migrate_vma_alloc_page(state);
struct page *dst_page = NULL;
if (!state->out_of_memory)
dst_page = uvm_migrate_vma_alloc_page(state);
if (!dst_page) {
__set_bit(i, state->allocation_failed_mask.page_mask);
continue;
@@ -734,7 +741,11 @@ static NV_STATUS uvm_migrate_vma_copy_pages_from(struct vm_area_struct *vma,
// Pre-allocate the dst pages and mark the ones that failed
for_each_set_bit(i, page_mask, state->num_pages) {
struct page *dst_page = uvm_migrate_vma_alloc_page(state);
struct page *dst_page = NULL;
if (!state->out_of_memory)
dst_page = uvm_migrate_vma_alloc_page(state);
if (!dst_page) {
__set_bit(i, state->allocation_failed_mask.page_mask);
continue;
@@ -1486,7 +1497,7 @@ NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
uvm_migrate_args->dst_node_id = uvm_gpu_numa_node(gpu);
}
state = kmem_cache_alloc(g_uvm_migrate_vma_state_cache, NV_UVM_GFP_FLAGS);
state = nv_kmem_cache_zalloc(g_uvm_migrate_vma_state_cache, NV_UVM_GFP_FLAGS);
if (!state)
return NV_ERR_NO_MEMORY;
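
Illustrative sketch (not part of the commit): the out_of_memory short-circuit pattern these hunks introduce. Once one destination page allocation fails, the flag is latched and later iterations skip the allocator entirely, marking the remaining pages as allocation failures instead of retrying on a system that is already out of memory. Names and types below are simplified stand-ins.

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

typedef struct
{
    bool out_of_memory;     // set on the first failed allocation
} migrate_state_example_t;

// Stand-in for uvm_migrate_vma_alloc_page(): returns NULL and latches the
// OOM flag when allocation fails.
static void *example_alloc_dst_page(migrate_state_example_t *state)
{
    void *page = malloc(4096);

    if (!page)
        state->out_of_memory = true;

    return page;
}

// Pre-allocate destination pages; after the first failure, skip the
// allocator and just record the failure for every remaining page.
static size_t example_preallocate(migrate_state_example_t *state,
                                  void **dst_pages,
                                  bool *allocation_failed,
                                  size_t num_pages)
{
    size_t allocated = 0;

    for (size_t i = 0; i < num_pages; i++) {
        void *dst_page = NULL;

        if (!state->out_of_memory)
            dst_page = example_alloc_dst_page(state);

        if (!dst_page) {
            allocation_failed[i] = true;
            continue;
        }

        dst_pages[i] = dst_page;
        allocated++;
    }

    return allocated;
}

The switch from kmem_cache_alloc() to nv_kmem_cache_zalloc() in the last hunk ensures the new flag starts out false without an explicit initialization.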

View File

@@ -177,6 +177,9 @@ typedef struct
// Number of pages that are directly populated on the destination
unsigned long num_populate_anon_pages;
// Tracks whether an OOM condition was encountered.
bool out_of_memory;
} migrate_vma_state_t;
#if defined(CONFIG_MIGRATE_VMA_HELPER)

View File

@@ -406,7 +406,10 @@ static void chunk_pin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
uvm_assert_spinlock_locked(&pmm->list_lock);
// The PMM list_lock must be held, but uvm_assert_spinlock_locked() cannot
// be called here because the UVM context pointer is not available in
// interrupt context when this is reached from devmem_page_free().
UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
chunk->state = UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED;
@@ -434,7 +437,6 @@ static void chunk_unpin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_pmm_gpu_
uvm_assert_spinlock_locked(&pmm->list_lock);
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
UVM_ASSERT(chunk->va_block == NULL);
UVM_ASSERT(chunk_is_root_chunk_pinned(pmm, chunk));
UVM_ASSERT(new_state != UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
@@ -609,8 +611,6 @@ NV_STATUS uvm_pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
return status;
for (i = 0; i < num_chunks; ++i) {
UVM_ASSERT(chunks[i]->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
uvm_spin_lock(&pmm->list_lock);
chunk_unpin(pmm, chunks[i], UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
chunks[i]->is_referenced = false;
@@ -656,45 +656,29 @@ static void chunk_update_lists_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk
list_del_init(&chunk->list);
}
static void gpu_unpin_temp(uvm_pmm_gpu_t *pmm,
uvm_gpu_chunk_t *chunk,
uvm_va_block_t *va_block,
bool is_referenced)
void uvm_pmm_gpu_unpin_allocated(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block)
{
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
UVM_ASSERT(uvm_gpu_chunk_is_user(chunk));
INIT_LIST_HEAD(&chunk->list);
UVM_ASSERT(list_empty(&chunk->list));
UVM_ASSERT(va_block);
UVM_ASSERT(chunk->va_block == va_block);
UVM_ASSERT(chunk->va_block_page_index < uvm_va_block_num_cpu_pages(va_block));
uvm_spin_lock(&pmm->list_lock);
UVM_ASSERT(!chunk->va_block);
UVM_ASSERT(va_block);
UVM_ASSERT(chunk->va_block_page_index < uvm_va_block_num_cpu_pages(va_block));
chunk_unpin(pmm, chunk, UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
chunk->is_referenced = is_referenced;
chunk->va_block = va_block;
chunk_update_lists_locked(pmm, chunk);
uvm_spin_unlock(&pmm->list_lock);
}
void uvm_pmm_gpu_unpin_allocated(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block)
{
gpu_unpin_temp(pmm, chunk, va_block, false);
}
void uvm_pmm_gpu_unpin_referenced(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block)
{
gpu_unpin_temp(pmm, chunk, va_block, true);
}
void uvm_pmm_gpu_free(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_tracker_t *tracker)
{
NV_STATUS status;
if (!chunk)
// Referenced chunks are freed by Linux when the reference is released.
if (!chunk || chunk->is_referenced)
return;
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
@@ -760,6 +744,10 @@ static bool assert_chunk_mergeable(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
size_t i;
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
UVM_ASSERT_MSG(chunk->suballoc->allocated == num_subchunks(chunk),
"%u != %u\n",
chunk->suballoc->allocated,
num_subchunks(chunk));
UVM_ASSERT(first_child->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
first_child->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
@@ -778,16 +766,6 @@ static bool assert_chunk_mergeable(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
}
}
if (first_child->state == UVM_PMM_GPU_CHUNK_STATE_FREE) {
UVM_ASSERT(chunk->suballoc->allocated == 0);
}
else {
UVM_ASSERT_MSG(chunk->suballoc->allocated == num_subchunks(chunk),
"%u != %u\n",
chunk->suballoc->allocated,
num_subchunks(chunk));
}
return true;
}
@@ -826,6 +804,7 @@ static void merge_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
else if (child_state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
UVM_ASSERT(root_chunk->chunk.suballoc->pinned_leaf_chunks >= num_sub);
root_chunk->chunk.suballoc->pinned_leaf_chunks += 1 - num_sub;
chunk->va_block = subchunk->va_block;
}
chunk->state = child_state;
@@ -849,7 +828,7 @@ static void merge_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
UVM_ASSERT(list_empty(&subchunk->list));
if ((child_state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) && uvm_gpu_chunk_is_user(subchunk))
UVM_ASSERT(subchunk->va_block != NULL);
UVM_ASSERT(subchunk->va_block);
kmem_cache_free(CHUNK_CACHE, subchunk);
}
@@ -1216,7 +1195,7 @@ void uvm_pmm_gpu_mark_chunk_evicted(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
UVM_ASSERT(chunk_is_in_eviction(pmm, chunk));
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
UVM_ASSERT(chunk->va_block != NULL);
UVM_ASSERT(chunk->va_block);
chunk->va_block = NULL;
chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
@@ -1885,9 +1864,9 @@ static void init_root_chunk(uvm_pmm_gpu_t *pmm,
uvm_pmm_gpu_chunk_state_string(chunk->state),
uvm_gpu_name(gpu));
UVM_ASSERT(chunk->parent == NULL);
UVM_ASSERT(chunk->suballoc == NULL);
UVM_ASSERT(chunk->va_block == NULL);
UVM_ASSERT(!chunk->parent);
UVM_ASSERT(!chunk->suballoc);
UVM_ASSERT(!chunk->va_block);
UVM_ASSERT(chunk->va_block_page_index == PAGES_PER_UVM_VA_BLOCK);
UVM_ASSERT(list_empty(&chunk->list));
UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_MAX);
@@ -2145,6 +2124,9 @@ NV_STATUS split_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
subchunk->va_block_page_index = chunk->va_block_page_index + (i * subchunk_size) / PAGE_SIZE;
subchunk->is_referenced = chunk->is_referenced;
}
else if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
subchunk->va_block = chunk->va_block;
}
}
// We're splitting an allocated or pinned chunk in-place.
@@ -2170,6 +2152,10 @@ NV_STATUS split_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
// accounting for the root chunk itself so add the 1 back.
if (chunk_is_root_chunk(chunk))
root_chunk->chunk.suballoc->pinned_leaf_chunks += 1;
chunk->va_block = NULL;
chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
chunk->is_referenced = false;
}
chunk->state = UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT;
@@ -2251,16 +2237,16 @@ static void chunk_free_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
if (root_chunk->chunk.in_eviction) {
// A root chunk with pinned subchunks would never be picked for eviction
// so this one has to be in the allocated state. Pin it and let the
// evicting thread pick it up.
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
UVM_ASSERT(chunk->va_block != NULL);
UVM_ASSERT(chunk->va_block_page_index != PAGES_PER_UVM_VA_BLOCK);
UVM_ASSERT(list_empty(&chunk->list));
chunk->va_block = NULL;
chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
chunk->is_zero = false;
chunk_pin(pmm, chunk);
// but HMM evictions will end up here so leave the chunk pinned (or pin
// it) and let the eviction thread pick it up.
if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
UVM_ASSERT(chunk->va_block);
UVM_ASSERT(list_empty(&chunk->list));
chunk->va_block = NULL;
chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
chunk->is_zero = false;
chunk_pin(pmm, chunk);
}
return;
}
@@ -2274,17 +2260,15 @@ static void chunk_free_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
}
}
if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
chunk_unpin(pmm, chunk, UVM_PMM_GPU_CHUNK_STATE_FREE);
}
else {
chunk->state = UVM_PMM_GPU_CHUNK_STATE_FREE;
chunk->va_block = NULL;
}
chunk->va_block = NULL;
chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
chunk->is_zero = false;
if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED)
chunk_unpin(pmm, chunk, UVM_PMM_GPU_CHUNK_STATE_FREE);
else
chunk->state = UVM_PMM_GPU_CHUNK_STATE_FREE;
chunk_update_lists_locked(pmm, chunk);
}
@@ -3117,6 +3101,11 @@ static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
break;
}
if (page->zone_device_data) {
ret = false;
break;
}
if (page_count(page)) {
ret = false;
break;
@@ -3128,12 +3117,17 @@ static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
static void devmem_page_free(struct page *page)
{
uvm_va_space_t *va_space = uvm_pmm_devmem_page_to_va_space(page);
uvm_gpu_chunk_t *chunk = uvm_pmm_devmem_page_to_chunk(page);
uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(chunk);
atomic64_dec(&va_space->hmm.allocated_page_count);
UVM_ASSERT(atomic64_read(&va_space->hmm.allocated_page_count) >= 0);
if (chunk->va_block) {
uvm_va_space_t *va_space = chunk->va_block->hmm.va_space;
UVM_ASSERT(va_space);
atomic64_dec(&va_space->hmm.allocated_page_count);
UVM_ASSERT(atomic64_read(&va_space->hmm.allocated_page_count) >= 0);
}
page->zone_device_data = NULL;
// We should be calling free_chunk() except that it acquires a mutex and
@@ -3143,7 +3137,20 @@ static void devmem_page_free(struct page *page)
spin_lock(&gpu->pmm.list_lock.lock);
UVM_ASSERT(chunk->is_referenced);
chunk->va_block = NULL;
chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
chunk->is_referenced = false;
if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
list_del_init(&chunk->list);
chunk_pin(&gpu->pmm, chunk);
}
else {
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
UVM_ASSERT(list_empty(&chunk->list));
}
list_add_tail(&chunk->list, &gpu->pmm.root_chunks.va_block_lazy_free);
spin_unlock(&gpu->pmm.list_lock.lock);
@@ -3430,6 +3437,7 @@ static void process_lazy_free(uvm_pmm_gpu_t *pmm)
// is empty.
while (!list_empty(&pmm->root_chunks.va_block_lazy_free)) {
chunk = list_first_entry(&pmm->root_chunks.va_block_lazy_free, uvm_gpu_chunk_t, list);
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
list_del_init(&chunk->list);
uvm_spin_unlock(&pmm->list_lock);
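
Illustrative sketch (not part of the commit): the deferral pattern used by devmem_page_free() above. The page-free callback runs in a context where the PMM mutex cannot be taken, so the chunk is pinned, queued on a spinlock-protected lazy-free list, and released later by a worker, as process_lazy_free() does (the driver uses its own nv_kthread_q worker and flushes it at deinit). The sketch models the same list-plus-worker structure with generic Linux primitives; it is a simplified, hypothetical rendering, not the driver's code.

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct lazy_item
{
    struct list_head list;
};

struct lazy_free_queue
{
    spinlock_t lock;                // protects the list; safe in atomic context
    struct list_head items;
    struct work_struct work;        // worker that does the real free
};

// Called from a context that cannot sleep (e.g. a page free callback):
// just queue the item and kick the worker.
static void lazy_free_defer(struct lazy_free_queue *q, struct lazy_item *item)
{
    unsigned long flags;

    spin_lock_irqsave(&q->lock, flags);
    list_add_tail(&item->list, &q->items);
    spin_unlock_irqrestore(&q->lock, flags);

    schedule_work(&q->work);
}

// Worker context: may take mutexes and free for real.
static void lazy_free_worker(struct work_struct *work)
{
    struct lazy_free_queue *q = container_of(work, struct lazy_free_queue, work);
    unsigned long flags;

    spin_lock_irqsave(&q->lock, flags);
    while (!list_empty(&q->items)) {
        struct lazy_item *item = list_first_entry(&q->items, struct lazy_item, list);

        list_del_init(&item->list);
        spin_unlock_irqrestore(&q->lock, flags);

        // The real release happens here, outside the spinlock, where taking
        // mutexes is allowed.
        kfree(item);

        spin_lock_irqsave(&q->lock, flags);
    }
    spin_unlock_irqrestore(&q->lock, flags);
}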
@@ -3587,9 +3595,9 @@ void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm)
gpu = uvm_pmm_to_gpu(pmm);
UVM_ASSERT(uvm_pmm_gpu_check_orphan_pages(pmm));
nv_kthread_q_flush(&gpu->parent->lazy_free_q);
UVM_ASSERT(list_empty(&pmm->root_chunks.va_block_lazy_free));
UVM_ASSERT(uvm_pmm_gpu_check_orphan_pages(pmm));
release_free_root_chunks(pmm);
if (gpu->mem_info.size != 0 && gpu_supports_pma_eviction(gpu))

View File

@@ -264,6 +264,11 @@ struct uvm_gpu_chunk_struct
// This flag indicates an allocated user chunk is referenced by a device
// private struct page PTE and therefore expects a page_free() callback.
// The flag is only for sanity checking: uvm_pmm_gpu_free() shouldn't be
// called while Linux holds a device private reference to this chunk, and
// devmem_page_free() should only be called from the Linux callback if a
// reference was created.
// See uvm_hmm_va_block_service_locked() and fill_dst_pfn() for details.
//
// This field is always false in kernel chunks.
bool is_referenced : 1;
@@ -293,6 +298,9 @@ struct uvm_gpu_chunk_struct
// The VA block using the chunk, if any.
// User chunks that are not backed by a VA block are considered to be
// temporarily pinned and cannot be evicted.
// Note that the chunk state is normally UVM_PMM_GPU_CHUNK_STATE_ALLOCATED
// but can also be UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED if an HMM va_block
// and device private struct page have a pointer to this chunk.
//
// This field is always NULL in kernel chunks.
uvm_va_block_t *va_block;
@@ -441,17 +449,16 @@ NvU64 uvm_gpu_chunk_to_sys_addr(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
// Allocates num_chunks chunks of size chunk_size in caller-supplied array
// (chunks).
//
// Returned chunks are in the TEMP_PINNED state, requiring a call to either
// uvm_pmm_gpu_unpin_allocated, uvm_pmm_gpu_unpin_referenced, or
// uvm_pmm_gpu_free. If a tracker is passed in, all
// the pending operations on the allocated chunks will be added to it
// Returned chunks are in the TEMP_PINNED state, requiring a call to
// uvm_pmm_gpu_unpin_allocated or uvm_pmm_gpu_free. If a tracker is passed in,
// all the pending operations on the allocated chunks will be added to it
// guaranteeing that all the entries come from the same GPU as the PMM.
// Otherwise, when tracker is NULL, all the pending operations will be
// synchronized before returning to the caller.
//
// Each of the allocated chunks list nodes (uvm_gpu_chunk_t::list) can be used
// by the caller until the chunk is unpinned (uvm_pmm_gpu_unpin_allocated,
// uvm_pmm_gpu_unpin_referenced) or freed (uvm_pmm_gpu_free). If used, the list
// by the caller until the chunk is unpinned (uvm_pmm_gpu_unpin_allocated)
// or freed (uvm_pmm_gpu_free). If used, the list
// node has to be returned to a valid state before calling either of the APIs.
//
// In case of an error, the chunks array is guaranteed to be cleared.
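
Illustrative sketch (not part of the commit): the chunk lifecycle this comment describes after uvm_pmm_gpu_unpin_referenced() is removed. Chunks come back from allocation TEMP_PINNED and must either be unpinned into the ALLOCATED state (with a reverse map to the owning VA block) or freed; referenced chunks are now released via the Linux page_free callback rather than a dedicated unpin call. The enum and helpers below are simplified stand-ins, not the driver's types.

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

// Simplified model of the chunk states used in this example.
typedef enum
{
    EXAMPLE_CHUNK_FREE,
    EXAMPLE_CHUNK_TEMP_PINNED,   // state right after allocation
    EXAMPLE_CHUNK_ALLOCATED,     // unpinned and owned by a VA block
} example_chunk_state_t;

typedef struct
{
    example_chunk_state_t state;
    void *va_block;              // reverse map, set when unpinned
    bool is_referenced;          // Linux holds a device private reference
} example_chunk_t;

// Models uvm_pmm_gpu_unpin_allocated(): TEMP_PINNED -> ALLOCATED with a
// reverse map to the owning VA block.
static void example_unpin_allocated(example_chunk_t *chunk, void *va_block)
{
    assert(chunk->state == EXAMPLE_CHUNK_TEMP_PINNED);
    assert(va_block);

    chunk->va_block = va_block;
    chunk->state = EXAMPLE_CHUNK_ALLOCATED;
}

// Models uvm_pmm_gpu_free(): referenced chunks are skipped because Linux
// frees them when the device private page reference is dropped.
static void example_free(example_chunk_t *chunk)
{
    if (!chunk || chunk->is_referenced)
        return;

    chunk->va_block = NULL;
    chunk->state = EXAMPLE_CHUNK_FREE;
}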
@@ -484,12 +491,6 @@ NV_STATUS uvm_pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
// Can only be used on user memory.
void uvm_pmm_gpu_unpin_allocated(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block);
// Unpin a temporarily pinned chunk, set its reverse map to a VA block, and
// mark it as referenced.
//
// Can only be used on user memory.
void uvm_pmm_gpu_unpin_referenced(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block);
// Free a user or kernel chunk. Temporarily pinned chunks are unpinned.
//
// The tracker is optional and a NULL tracker indicates that no new operation

View File

@@ -124,6 +124,18 @@ uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block)
return va_space;
}
// Check if the GPU should cache system memory. This depends on whether the GPU
// supports full coherence and whether it has sticky L2 coherent cache lines.
// System memory is also cached if the uvm_exp_gpu_cache_sysmem module
// parameter is set.
static bool gpu_should_cache_sysmem(uvm_gpu_t *gpu)
{
if (uvm_parent_gpu_supports_full_coherence(gpu->parent) &&
!gpu->parent->sticky_l2_coherent_cache_lines) {
return true;
}
return uvm_exp_gpu_cache_sysmem != 0;
}
static NvU64 block_gpu_pte_flag_cacheable(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id)
{
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
@@ -135,7 +147,7 @@ static NvU64 block_gpu_pte_flag_cacheable(uvm_va_block_t *block, uvm_gpu_t *gpu,
return UVM_MMU_PTE_FLAGS_CACHED;
if (UVM_ID_IS_CPU(resident_id))
return uvm_exp_gpu_cache_sysmem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED;
return gpu_should_cache_sysmem(gpu) ? UVM_MMU_PTE_FLAGS_CACHED : UVM_MMU_PTE_FLAGS_NONE;
UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(gpu->id)], resident_id));
@@ -420,11 +432,13 @@ static uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page_resident(uvm_va_block_t
return chunk;
}
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
int nid,
uvm_page_index_t page_index)
{
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid);
uvm_cpu_chunk_storage_mixed_t *mixed;
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
size_t slot_index;
uvm_cpu_chunk_t **chunks;
@@ -759,7 +773,7 @@ static bool block_check_cpu_chunks(uvm_va_block_t *block)
int nid;
uvm_page_mask_t *temp_resident_mask;
temp_resident_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS | __GFP_ZERO);
temp_resident_mask = nv_kmem_cache_zalloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
for_each_possible_uvm_node(nid) {
uvm_cpu_chunk_t *chunk;
@@ -821,16 +835,16 @@ void uvm_va_block_retry_deinit(uvm_va_block_retry_t *retry, uvm_va_block_t *va_b
uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
}
// HMM should have already moved allocated GPU chunks to the referenced
// state or freed them.
if (uvm_va_block_is_hmm(va_block))
UVM_ASSERT(list_empty(&retry->used_chunks));
// Unpin all the used chunks now that we are done
list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->used_chunks, list) {
list_del_init(&gpu_chunk->list);
gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
// HMM should have already moved allocated blocks to the referenced
// state so any left over were not migrated and should be freed.
if (uvm_va_block_is_hmm(va_block))
uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
else
uvm_pmm_gpu_unpin_allocated(&gpu->pmm, gpu_chunk, va_block);
uvm_pmm_gpu_unpin_allocated(&gpu->pmm, gpu_chunk, va_block);
}
}
@@ -1152,6 +1166,8 @@ static size_t block_gpu_chunk_index(uvm_va_block_t *block,
UVM_ASSERT(gpu_state->chunks);
chunk = gpu_state->chunks[index];
if (chunk) {
UVM_ASSERT(uvm_gpu_chunk_is_user(chunk));
UVM_ASSERT(uvm_id_equal(uvm_gpu_id_from_index(chunk->gpu_index), gpu->id));
UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == size);
UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_FREE);
@@ -1390,10 +1406,7 @@ error:
return status;
}
// Retrieves the gpu_state for the given GPU. The returned pointer is
// internally managed and will be allocated (and freed) automatically,
// rather than by the caller.
static uvm_va_block_gpu_state_t *block_gpu_state_get_alloc(uvm_va_block_t *block, uvm_gpu_t *gpu)
uvm_va_block_gpu_state_t *uvm_va_block_gpu_state_get_alloc(uvm_va_block_t *block, uvm_gpu_t *gpu)
{
NV_STATUS status;
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
@@ -1425,22 +1438,6 @@ error:
return NULL;
}
NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block)
{
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
uvm_gpu_id_t gpu_id;
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
uvm_assert_mutex_locked(&va_block->lock);
for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpus) {
if (!block_gpu_state_get_alloc(va_block, uvm_gpu_get(gpu_id)))
return NV_ERR_NO_MEMORY;
}
return NV_OK;
}
void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block,
uvm_cpu_chunk_t *chunk)
{
@@ -1495,7 +1492,7 @@ void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_regio
uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], chunk_region);
uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region);
uvm_va_block_cpu_clear_resident_region(va_block, nid, chunk_region);
uvm_cpu_chunk_remove_from_block(va_block, nid, page_index);
uvm_cpu_chunk_remove_from_block(va_block, chunk, nid, page_index);
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk);
uvm_cpu_chunk_free(chunk);
}
@@ -1591,26 +1588,6 @@ static NV_STATUS block_alloc_cpu_chunk(uvm_va_block_t *block,
return status;
}
// Same as block_alloc_cpu_chunk() but allocate a chunk suitable for use as
// a HMM destination page. The main difference is UVM does not own the reference
// on the struct page backing these chunks.
static NV_STATUS block_alloc_hmm_cpu_chunk(uvm_va_block_t *block,
uvm_chunk_sizes_mask_t cpu_allocation_sizes,
uvm_cpu_chunk_alloc_flags_t flags,
int nid,
uvm_cpu_chunk_t **chunk)
{
NV_STATUS status;
UVM_ASSERT(uvm_va_block_is_hmm(block));
status = block_alloc_cpu_chunk(block, cpu_allocation_sizes, flags, nid, chunk);
if (status == NV_OK)
(*chunk)->type = UVM_CPU_CHUNK_TYPE_HMM;
return status;
}
// Find the largest allocation size we can use for the given page_index in the
// given block. Returns the mask of possible sizes and region covered by the
// largest. Callers may also elect to use a smaller size.
@@ -1842,7 +1819,7 @@ static NV_STATUS block_add_cpu_chunk(uvm_va_block_t *block,
status = uvm_va_block_map_cpu_chunk_on_gpus(block, chunk);
if (status != NV_OK) {
uvm_cpu_chunk_remove_from_block(block, uvm_cpu_chunk_get_numa_node(chunk), page_index);
uvm_cpu_chunk_remove_from_block(block, chunk, uvm_cpu_chunk_get_numa_node(chunk), page_index);
goto out;
}
}
@@ -1866,8 +1843,7 @@ out:
static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
const uvm_page_mask_t *populate_page_mask,
uvm_va_block_region_t populate_region,
uvm_va_block_context_t *block_context,
bool staged)
uvm_va_block_context_t *block_context)
{
NV_STATUS status = NV_OK;
uvm_cpu_chunk_t *chunk;
@@ -1965,13 +1941,7 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
if (!uvm_page_mask_region_full(resident_mask, region))
chunk_alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO;
// Management of a page used for a staged migration is never handed off
// to the kernel and is really just a driver managed page. Therefore
// don't allocate a HMM chunk in this case.
if (uvm_va_block_is_hmm(block) && !staged)
status = block_alloc_hmm_cpu_chunk(block, allocation_sizes, chunk_alloc_flags, preferred_nid, &chunk);
else
status = block_alloc_cpu_chunk(block, allocation_sizes, chunk_alloc_flags, preferred_nid, &chunk);
status = block_alloc_cpu_chunk(block, allocation_sizes, chunk_alloc_flags, preferred_nid, &chunk);
if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
alloc_flags &= ~UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT;
@@ -1991,11 +1961,6 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
// Skip iterating over all pages covered by the allocated chunk.
page_index = region.outer - 1;
#if UVM_IS_CONFIG_HMM()
if (uvm_va_block_is_hmm(block) && block_context)
block_context->hmm.dst_pfns[page_index] = migrate_pfn(page_to_pfn(chunk->page));
#endif
}
return NV_OK;
@@ -2003,7 +1968,7 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
NV_STATUS uvm_va_block_populate_page_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_va_block_context_t *block_context)
{
return block_populate_pages_cpu(va_block, NULL, uvm_va_block_region_for_page(page_index), block_context, false);
return block_populate_pages_cpu(va_block, NULL, uvm_va_block_region_for_page(page_index), block_context);
}
// Try allocating a chunk. If eviction was required,
@@ -2392,7 +2357,7 @@ static uvm_page_mask_t *block_resident_mask_get_alloc(uvm_va_block_t *block, uvm
if (UVM_ID_IS_CPU(processor))
return uvm_va_block_resident_mask_get(block, processor, nid);
gpu_state = block_gpu_state_get_alloc(block, uvm_gpu_get(processor));
gpu_state = uvm_va_block_gpu_state_get_alloc(block, uvm_gpu_get(processor));
if (!gpu_state)
return NULL;
@@ -2432,9 +2397,15 @@ void uvm_va_block_unmapped_pages_get(uvm_va_block_t *va_block,
return;
}
uvm_page_mask_zero(out_mask);
uvm_page_mask_region_fill(out_mask, region);
for_each_id_in_mask(id, &va_block->mapped) {
// UVM-HMM doesn't always know when CPU pages are mapped or not since there
// is no notification when CPU page tables are upgraded. If the page is
// resident, assume the CPU has some mapping.
uvm_page_mask_andnot(out_mask, out_mask, uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE));
for_each_gpu_id_in_mask(id, &va_block->mapped) {
uvm_page_mask_andnot(out_mask, out_mask, uvm_va_block_map_mask_get(va_block, id));
}
}
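
Illustrative sketch (not part of the commit): the mask arithmetic this hunk switches to. The unmapped set starts as the full region, then pages the CPU has resident are subtracted (since HMM cannot reliably track CPU mappings), followed by every GPU's mapped mask. Plain 64-bit bitmasks stand in for uvm_page_mask_t here.

#include <stdint.h>

#define EXAMPLE_NUM_GPUS 4

// One bit per page; returns the pages in the region with no known mapping.
static uint64_t example_unmapped_pages_get(uint64_t region_mask,
                                           uint64_t cpu_resident_mask,
                                           const uint64_t gpu_mapped_masks[EXAMPLE_NUM_GPUS])
{
    // Start with every page in the region, then knock out pages the CPU may
    // have mapped (approximated by CPU residency) and pages each GPU maps.
    uint64_t unmapped = region_mask;

    unmapped &= ~cpu_resident_mask;

    for (int i = 0; i < EXAMPLE_NUM_GPUS; i++)
        unmapped &= ~gpu_mapped_masks[i];

    return unmapped;
}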
@@ -2931,7 +2902,7 @@ static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block,
size_t chunk_index,
uvm_va_block_region_t chunk_region)
{
uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu);
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get_alloc(block, gpu);
uvm_gpu_chunk_t *chunk = NULL;
uvm_chunk_size_t chunk_size = uvm_va_block_region_size(chunk_region);
uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
@@ -2974,11 +2945,11 @@ static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block,
if (status != NV_OK)
goto chunk_free;
// An allocation retry might cause another thread to call UvmDiscard when it drops
// the block lock. As a result, inconsistent residency bits would trigger redundant
// zeroing of the chunk. Current implementation chooses to do the redundant zeroing
// as for better performance tradeoffs (tracking to avoid this case could cost more)
// and security.
// An allocation retry might cause another thread to call UvmDiscard when it
// drops the block lock. As a result, inconsistent residency bits would
// trigger redundant zeroing of the chunk. The current implementation chooses
// to do the redundant zeroing for better performance tradeoffs
// (tracking to avoid this case could cost more) and security.
status = block_zero_new_gpu_chunk(block, gpu, chunk, chunk_region, &retry->tracker);
if (status != NV_OK)
goto chunk_unmap;
@@ -3002,8 +2973,10 @@ static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block,
}
// Record the used chunk so that it can be unpinned at the end of the whole
// operation.
// operation. HMM chunks are unpinned after a successful migration.
block_retry_add_used_chunk(retry, chunk);
chunk->va_block = block;
gpu_state->chunks[chunk_index] = chunk;
return NV_OK;
@@ -3020,12 +2993,13 @@ chunk_free:
}
// Populate all chunks which cover the given region and page mask.
static NV_STATUS block_populate_pages_gpu(uvm_va_block_t *block,
NV_STATUS uvm_va_block_populate_pages_gpu(uvm_va_block_t *block,
uvm_va_block_retry_t *retry,
uvm_gpu_t *gpu,
uvm_gpu_id_t gpu_id,
uvm_va_block_region_t region,
const uvm_page_mask_t *populate_mask)
{
uvm_gpu_t *gpu = uvm_gpu_get(gpu_id);
uvm_va_block_region_t chunk_region, check_region;
size_t chunk_index;
uvm_page_index_t page_index;
@@ -3102,7 +3076,7 @@ static NV_STATUS block_populate_pages(uvm_va_block_t *block,
if (!tmp_processor_mask)
return NV_ERR_NO_MEMORY;
status = block_populate_pages_gpu(block, retry, uvm_gpu_get(dest_id), region, populate_page_mask);
status = uvm_va_block_populate_pages_gpu(block, retry, dest_id, region, populate_page_mask);
if (status != NV_OK) {
uvm_processor_mask_cache_free(tmp_processor_mask);
return status;
@@ -3150,7 +3124,7 @@ static NV_STATUS block_populate_pages(uvm_va_block_t *block,
}
uvm_memcg_context_start(&memcg_context, block_context->mm);
status = block_populate_pages_cpu(block, cpu_populate_mask, region, block_context, UVM_ID_IS_GPU(dest_id));
status = block_populate_pages_cpu(block, cpu_populate_mask, region, block_context);
uvm_memcg_context_end(&memcg_context);
return status;
}
@@ -4199,7 +4173,7 @@ static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
// Ensure that there is GPU state that can be used for CPU-to-CPU copies
if (UVM_ID_IS_CPU(dst_id) && uvm_id_equal(src_id, dst_id)) {
uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, copying_gpu);
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get_alloc(block, copying_gpu);
if (!gpu_state) {
status = NV_ERR_NO_MEMORY;
break;
@@ -4893,6 +4867,7 @@ static void block_cleanup_temp_pinned_gpu_chunks(uvm_va_block_t *va_block, uvm_g
// block_populate_pages above. Release them since the copy
// failed and they won't be mapped to userspace.
if (chunk && chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
list_del_init(&chunk->list);
uvm_mmu_chunk_unmap(chunk, &va_block->tracker);
uvm_pmm_gpu_free(&gpu->pmm, chunk, &va_block->tracker);
gpu_state->chunks[i] = NULL;
@@ -4993,7 +4968,8 @@ NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block,
prefetch_page_mask,
UVM_VA_BLOCK_TRANSFER_MODE_MOVE);
if (status != NV_OK) {
// HMM does its own cleanup.
if (status != NV_OK && !uvm_va_block_is_hmm(va_block)) {
if (UVM_ID_IS_GPU(dest_id))
block_cleanup_temp_pinned_gpu_chunks(va_block, dest_id);
@@ -7971,7 +7947,7 @@ static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block,
gpu = gpu_va_space->gpu;
big_page_size = gpu_va_space->page_tables.big_page_size;
gpu_state = block_gpu_state_get_alloc(block, gpu);
gpu_state = uvm_va_block_gpu_state_get_alloc(block, gpu);
if (!gpu_state)
return NV_ERR_NO_MEMORY;
@@ -8705,12 +8681,12 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
gpu = uvm_gpu_get(id);
// Although this GPU UUID is registered in the VA space, it might not have a
// GPU VA space registered.
// Although this GPU UUID is registered in the VA space, it might not
// have a GPU VA space registered.
if (!uvm_gpu_va_space_get(va_space, gpu))
return NV_OK;
gpu_state = block_gpu_state_get_alloc(va_block, gpu);
gpu_state = uvm_va_block_gpu_state_get_alloc(va_block, gpu);
if (!gpu_state)
return NV_ERR_NO_MEMORY;
@@ -9760,7 +9736,7 @@ static void block_kill(uvm_va_block_t *block)
if (!uvm_va_block_is_hmm(block))
uvm_cpu_chunk_mark_dirty(chunk, 0);
uvm_cpu_chunk_remove_from_block(block, nid, page_index);
uvm_cpu_chunk_remove_from_block(block, chunk, nid, page_index);
uvm_cpu_chunk_free(chunk);
}
@@ -9824,13 +9800,12 @@ void uvm_va_block_kill(uvm_va_block_t *va_block)
static void block_gpu_release_region(uvm_va_block_t *va_block,
uvm_gpu_id_t gpu_id,
uvm_va_block_gpu_state_t *gpu_state,
uvm_page_mask_t *page_mask,
uvm_va_block_region_t region)
{
uvm_page_index_t page_index;
uvm_gpu_t *gpu = uvm_gpu_get(gpu_id);
for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
for_each_va_block_page_in_region(page_index, region) {
size_t chunk_index = block_gpu_chunk_index(va_block, gpu, page_index, NULL);
uvm_gpu_chunk_t *gpu_chunk = gpu_state->chunks[chunk_index];
@@ -9875,7 +9850,7 @@ void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
uvm_processor_mask_clear(&va_block->evicted_gpus, gpu_id);
if (gpu_state->chunks) {
block_gpu_release_region(va_block, gpu_id, gpu_state, NULL, region);
block_gpu_release_region(va_block, gpu_id, gpu_state, region);
// TODO: bug 3660922: Need to update the read duplicated pages mask
// when read duplication is supported for HMM.
@@ -10446,7 +10421,7 @@ static NV_STATUS block_split_preallocate_no_retry(uvm_va_block_t *existing, uvm_
if (status != NV_OK)
goto error;
if (!block_gpu_state_get_alloc(new, gpu)) {
if (!uvm_va_block_gpu_state_get_alloc(new, gpu)) {
status = NV_ERR_NO_MEMORY;
goto error;
}
@@ -10620,7 +10595,7 @@ static void block_split_cpu(uvm_va_block_t *existing, uvm_va_block_t *new)
uvm_page_index_t new_chunk_page_index;
NV_STATUS status;
uvm_cpu_chunk_remove_from_block(existing, nid, page_index);
uvm_cpu_chunk_remove_from_block(existing, chunk, nid, page_index);
// The chunk has to be adjusted for the new block before inserting it.
new_chunk_page_index = page_index - split_page_index;
@@ -13532,7 +13507,7 @@ out:
static NV_STATUS block_gpu_force_4k_ptes(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu)
{
uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu);
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get_alloc(block, gpu);
uvm_push_t push;
NV_STATUS status;

View File

@@ -1375,9 +1375,11 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
uvm_va_block_t *va_block,
uvm_service_block_context_t *service_context);
// Allocate GPU state for the given va_block and registered GPUs.
// Returns the gpu_state for the given GPU. The returned pointer is
// internally managed and will be allocated (and freed) automatically,
// rather than by the caller. Returns NULL if memory allocation fails.
// Locking: The block lock must be held.
NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block);
uvm_va_block_gpu_state_t *uvm_va_block_gpu_state_get_alloc(uvm_va_block_t *va_block, uvm_gpu_t *gpu);
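
Illustrative caller sketch (not part of the commit), assuming the UVM driver headers: the usual pattern for the newly exported uvm_va_block_gpu_state_get_alloc(), as seen at its call sites in this change (block_pre_populate_pde1_gpu, uvm_va_block_map, etc.). The state is allocated on demand and owned by the block, so callers only handle the NULL (out of memory) case. The helper name is hypothetical.

// Hypothetical caller; the block lock must be held.
static NV_STATUS example_ensure_gpu_state(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
{
    uvm_va_block_gpu_state_t *gpu_state;

    uvm_assert_mutex_locked(&va_block->lock);

    gpu_state = uvm_va_block_gpu_state_get_alloc(va_block, gpu);
    if (!gpu_state)
        return NV_ERR_NO_MEMORY;

    return NV_OK;
}
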
// Release any GPU or policy data associated with the given region in response
// to munmap().
@@ -2152,10 +2154,13 @@ bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, u
// Locking: The va_block lock must be held.
NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
// Remove a CPU chunk at the given page_index from the va_block.
// Remove the given CPU chunk at the given page_index from the va_block.
// nid cannot be NUMA_NO_NODE.
// Locking: The va_block lock must be held.
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index);
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
int nid,
uvm_page_index_t page_index);
// Return the CPU chunk at the given page_index on the given NUMA node from the
// va_block. nid cannot be NUMA_NO_NODE.
@@ -2288,6 +2293,13 @@ NV_STATUS uvm_va_block_populate_page_cpu(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
uvm_va_block_context_t *block_context);
// Populate all GPU chunks which cover the given region and page mask.
NV_STATUS uvm_va_block_populate_pages_gpu(uvm_va_block_t *block,
uvm_va_block_retry_t *retry,
uvm_gpu_id_t gpu_id,
uvm_va_block_region_t region,
const uvm_page_mask_t *populate_mask);
// A helper macro for handling allocation-retry
//
// The macro takes a VA block, uvm_va_block_retry_t struct and a function call