mirror of https://github.com/NVIDIA/open-gpu-kernel-modules.git
Commit: 525.53
@@ -249,7 +249,7 @@ struct uvm_va_block_struct
     // Lock protecting the block. See the comment at the top of uvm.c.
     uvm_mutex_t lock;

-    // Parent VA range. UVM managed blocks have this set. HMM blocks will have
+    // Parent VA range. Managed blocks have this set. HMM blocks will have
     // va_range set to NULL and hmm.va_space set instead. Dead blocks that are
     // waiting for the last ref count to be removed have va_range and
     // hmm.va_space set to NULL (could be either type of block).
@@ -437,13 +437,22 @@ struct uvm_va_block_struct

     uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];

+    // Prefetch information that is updated while holding the va_block lock but
+    // records state while the lock is not held.
+    struct
+    {
+        uvm_processor_id_t last_migration_proc_id;
+
+        NvU16 fault_migrations_to_last_proc;
+    } prefetch_info;
+
 #if UVM_IS_CONFIG_HMM()
     struct
     {
         // The MMU notifier is registered per va_block.
         struct mmu_interval_notifier notifier;

-        // Parent VA space pointer. It is NULL for UVM managed blocks or if
+        // Parent VA space pointer. It is NULL for managed blocks or if
         // the HMM block is dead. This field can be read while holding the
         // block lock and is only modified while holding the va_space write
         // lock and va_block lock (same as the va_range pointer).
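
For illustration, the per-block notifier above would be registered against the
block's virtual range through the Linux MMU interval notifier API. A minimal
sketch, assuming a caller-supplied ops table and ignoring error handling (this
helper is not part of the change):

static int example_register_block_notifier(uvm_va_block_t *va_block,
                                           struct mm_struct *mm,
                                           const struct mmu_interval_notifier_ops *ops)
{
    // Cover exactly the block's virtual range; the kernel invalidates this
    // interval whenever the address space changes beneath it.
    return mmu_interval_notifier_insert(&va_block->hmm.notifier,
                                        mm,
                                        va_block->start,
                                        uvm_va_block_size(va_block),
                                        ops);
}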
@@ -488,21 +497,27 @@ struct uvm_va_block_wrapper_struct
         // uvm_cpu_chunk_allocation_sizes module parameter.
         NvU32 cpu_chunk_allocation_size_mask;

+        // Force the next eviction attempt on this block to fail. Used for
+        // testing only.
+        bool inject_eviction_error;
+
         // Subsequent operations that need to allocate CPU pages will fail. As
-        // opposed to other error injection settings, this one is persistent.
+        // opposed to other error injection settings, this one fails N times
+        // and then succeeds instead of failing on the Nth try. A value of ~0u
+        // means fail indefinitely.
         // This is because this error is supposed to be fatal and tests verify
         // the state of the VA blocks after the failure. However, some tests
         // use kernels to trigger migrations and a fault replay could trigger
         // a successful migration if this error flag is cleared.
         bool inject_cpu_pages_allocation_error;
+        NvU32 inject_cpu_pages_allocation_error_count;

-        // Force the next eviction attempt on this block to fail. Used for
-        // testing only.
-        bool inject_eviction_error;
-
+        // Force the next successful chunk allocation to then fail. Used for testing
+        // only to simulate driver metadata allocation failure.
+        bool inject_populate_error;

         // Force the next split on this block to fail.
         // Set by error injection ioctl for testing purposes only.
         bool inject_split_error;
     } test;
 };
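
The count-based injection described above could be consumed roughly as follows.
This is a sketch of the semantics only; the helper name and its exact
interaction with the inject_cpu_pages_allocation_error flag are assumptions,
not the driver's actual code:

static bool example_should_inject_cpu_alloc_error(struct uvm_va_block_wrapper_struct *wrapper)
{
    NvU32 *count = &wrapper->test.inject_cpu_pages_allocation_error_count;

    if (*count == 0)
        return false;

    // ~0u means fail indefinitely; any other value is decremented so the
    // allocation fails N times and then succeeds.
    if (*count != ~0u)
        (*count)--;

    return true;
}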
@@ -639,8 +654,18 @@ static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context,
     memset(va_block_context, 0xff, sizeof(*va_block_context));

     va_block_context->mm = mm;
+#if UVM_IS_CONFIG_HMM()
+    va_block_context->hmm.vma = NULL;
+#endif
 }

+// Check that a single policy covers the given region for the given va_block.
+// This always returns true and is intended to only be used with UVM_ASSERT().
+// Locking: the va_block lock must be held.
+bool uvm_va_block_check_policy_is_valid(uvm_va_block_t *va_block,
+                                        uvm_va_policy_t *policy,
+                                        uvm_va_block_region_t region);
+
 // TODO: Bug 1766480: Using only page masks instead of a combination of regions
 // and page masks could simplify the below APIs and their implementations
 // at the cost of having to scan the whole mask for small regions.
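
Since uvm_va_block_check_policy_is_valid() always returns true, it is intended
to live inside an assertion so the check compiles away in release builds, for
example:

    UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region));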
@@ -651,8 +676,10 @@ static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context,
 // pages in the region which are present in the mask.
 //
 // prefetch_page_mask may be passed as a subset of page_mask when cause is
-// UVM_MAKE_RESIDENT_CAUSE_FAULT to indicate pages that have been pulled due
-// to automatic page prefetching heuristics. For pages in this mask,
+// UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT,
+// UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT, or
+// UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER to indicate pages that have been
+// pulled due to automatic page prefetching heuristics. For pages in this mask,
 // UVM_MAKE_RESIDENT_CAUSE_PREFETCH will be reported in migration events,
 // instead.
 //
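
The subset relationship can be stated directly with the page-mask helpers from
this header; a caller passing both masks would be expected to satisfy:

    UVM_ASSERT(uvm_page_mask_subset(prefetch_page_mask, page_mask));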
@@ -674,20 +701,24 @@ static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context,
 // block's lock has been unlocked and relocked as part of the call and that the
 // whole sequence of operations performed under the block's lock needs to be
 // attempted again. To facilitate that, the caller needs to provide the same
-// va_block_retry struct for each attempt that has been initialized before the first
-// attempt and needs to be deinitialized after the last one. Most callers can
-// just use UVM_VA_BLOCK_LOCK_RETRY() that takes care of that for the caller.
+// va_block_retry struct for each attempt that has been initialized before the
+// first attempt and needs to be deinitialized after the last one. Most callers
+// can just use UVM_VA_BLOCK_LOCK_RETRY() that takes care of that for the
+// caller.
 //
 // If dest_id is the CPU then va_block_retry can be NULL and allocation-retry of
 // user memory is guaranteed not to happen. Allocation-retry of page tables can
 // still occur though.
 //
-// va_block_context must be non-NULL. This function will set a bit in
+// va_block_context must not be NULL. This function will set a bit in
 // va_block_context->make_resident.pages_changed_residency for each page that
 // changed residency (due to a migration or first population) as a result of the
 // operation. This function only sets bits in that mask. It is the caller's
 // responsibility to zero the mask (or not) first.
 //
+// va_block_context->policy must also be set by the caller for the given region.
+// See the comments for uvm_va_block_check_policy_is_valid().
+//
 // Notably any status other than NV_OK indicates that the block's lock might
 // have been unlocked and relocked.
 //
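
A sketch of the retry protocol described above, for a caller that migrates a
whole block; the helper and the cause value chosen here are illustrative, not
part of this change:

static NV_STATUS example_make_resident(uvm_va_block_t *va_block,
                                       uvm_va_block_context_t *va_block_context,
                                       uvm_processor_id_t dest_id)
{
    uvm_va_block_retry_t va_block_retry;

    // UVM_VA_BLOCK_LOCK_RETRY() initializes va_block_retry, takes the block
    // lock, re-runs the call while it returns NV_ERR_MORE_PROCESSING_REQUIRED
    // (i.e. the lock was dropped and re-taken), and deinitializes the retry
    // struct when done.
    return UVM_VA_BLOCK_LOCK_RETRY(va_block, &va_block_retry,
        uvm_va_block_make_resident(va_block,
                                   &va_block_retry,
                                   va_block_context,
                                   dest_id,
                                   uvm_va_block_region_from_block(va_block),
                                   NULL,    // page_mask: all pages in the region
                                   NULL,    // prefetch_page_mask: none
                                   UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE));
}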
@@ -710,6 +741,8 @@ NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
 //   where they are unmapped
 // - All remote mappings (due to either SetAccessedBy or performance heuristics)
 //   are broken
+// - Only managed va_blocks are supported.
+//   TODO: Bug 3660922: need to implement HMM read duplication support.
 // - LOCKING: If va_block_context->mm != NULL, va_block_context->mm->mmap_lock
 //   must be held in at least read mode.
 NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
@@ -721,6 +754,34 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
                                                     const uvm_page_mask_t *prefetch_page_mask,
                                                     uvm_make_resident_cause_t cause);

+// Similar to uvm_va_block_make_resident() (read documentation there). The
+// difference is that source pages are only copied to the destination and the
+// residency is not updated until uvm_va_block_make_resident_post() is called.
+// Otherwise, the combination of uvm_va_block_make_resident_pre() and
+// uvm_va_block_make_resident_post() should be the same as just calling
+// uvm_va_block_make_resident().
+// This split is needed when using migrate_vma_setup() and migrate_vma_pages()
+// so that when migrate_vma_pages() indicates a page is not migrating, the
+// va_block state is not updated.
+// LOCKING: The caller must hold the va_block lock.
+NV_STATUS uvm_va_block_make_resident_pre(uvm_va_block_t *va_block,
+                                         uvm_va_block_retry_t *va_block_retry,
+                                         uvm_va_block_context_t *va_block_context,
+                                         uvm_processor_id_t dest_id,
+                                         uvm_va_block_region_t region,
+                                         const uvm_page_mask_t *page_mask,
+                                         const uvm_page_mask_t *prefetch_page_mask,
+                                         uvm_make_resident_cause_t cause);
+
+// The page_mask must be the same or a subset of the page_mask passed to
+// uvm_va_block_make_resident_pre(). This step updates the residency and breaks
+// read duplication.
+// LOCKING: The caller must hold the va_block lock.
+void uvm_va_block_make_resident_post(uvm_va_block_t *va_block,
+                                     uvm_va_block_context_t *va_block_context,
+                                     uvm_va_block_region_t region,
+                                     const uvm_page_mask_t *page_mask);
+
 // Creates or upgrades a mapping from the input processor to the given virtual
 // address region. Pages which already have new_prot permissions or higher are
 // skipped, so this call ensures that the range is mapped with at least new_prot
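
The intended ordering around migrate_vma can be sketched as below; the caller
shown, its migrated_mask bookkeeping, and the cause value are assumptions made
for illustration:

static NV_STATUS example_migrate_with_vma(uvm_va_block_t *va_block,
                                          uvm_va_block_retry_t *va_block_retry,
                                          uvm_va_block_context_t *va_block_context,
                                          uvm_processor_id_t dest_id,
                                          uvm_va_block_region_t region,
                                          const uvm_page_mask_t *page_mask,
                                          uvm_page_mask_t *migrated_mask,
                                          struct migrate_vma *args)
{
    NV_STATUS status;

    // Stage the copies to dest_id; block residency is not updated yet.
    status = uvm_va_block_make_resident_pre(va_block,
                                            va_block_retry,
                                            va_block_context,
                                            dest_id,
                                            region,
                                            page_mask,
                                            NULL,
                                            UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
    if (status != NV_OK)
        return status;

    // The kernel may refuse individual pages. The caller is assumed to shrink
    // migrated_mask (a subset of page_mask) based on args->src afterwards.
    migrate_vma_pages(args);

    // Commit residency only for the pages that actually migrated.
    uvm_va_block_make_resident_post(va_block, va_block_context, region, migrated_mask);

    return NV_OK;
}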
@@ -749,7 +810,8 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
 // pages because the earlier operation can cause a PTE split or merge which is
 // assumed by the later operation.
 //
-// va_block_context must not be NULL.
+// va_block_context must not be NULL and va_block_context->policy must be valid.
+// See the comments for uvm_va_block_check_policy_is_valid().
 //
 // If allocation-retry was required as part of the operation and was successful,
 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
@@ -805,7 +867,7 @@ NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block,
 // pages because the earlier operation can cause a PTE split or merge which is
 // assumed by the later operation.
 //
-// va_block_context must not be NULL.
+// va_block_context must not be NULL. The va_block_context->policy is unused.
 //
 // If allocation-retry was required as part of the operation and was successful,
 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
@@ -837,12 +899,20 @@ NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block,
 //   up-to-date data.
 // - Unmap the preferred location's processor from any pages in this region
 //   which are not resident on the preferred location.
 //
+// va_block_context must not be NULL and va_block_context->policy must be valid.
+// See the comments for uvm_va_block_check_policy_is_valid().
+//
 // LOCKING: The caller must hold the VA block lock.
 NV_STATUS uvm_va_block_set_preferred_location_locked(uvm_va_block_t *va_block,
                                                      uvm_va_block_context_t *va_block_context);

 // Maps the given processor to all resident pages in this block, as allowed by
 // location and policy. Waits for the operation to complete before returning.
+// This function should only be called with managed va_blocks.
+//
+// va_block_context must not be NULL and va_block_context->policy must be valid.
+// See the comments for uvm_va_block_check_policy_is_valid().
+//
 // LOCKING: This takes and releases the VA block lock. If va_block_context->mm
 //          != NULL, va_block_context->mm->mmap_lock must be held in at least
@@ -852,8 +922,10 @@ NV_STATUS uvm_va_block_set_accessed_by(uvm_va_block_t *va_block,
                                        uvm_processor_id_t processor_id);

 // Breaks SetAccessedBy and remote mappings
+// This function should only be called with managed va_blocks.
 //
-// va_block_context must NOT be NULL
+// va_block_context must not be NULL and va_block_context->policy must be valid.
+// See the comments for uvm_va_block_check_policy_is_valid().
 //
 // LOCKING: This takes and releases the VA block lock. If va_block_context->mm
 //          != NULL, va_block_context->mm->mmap_lock must be held in at least
@@ -862,8 +934,10 @@ NV_STATUS uvm_va_block_set_read_duplication(uvm_va_block_t *va_block,
                                             uvm_va_block_context_t *va_block_context);

 // Restores SetAccessedBy mappings
+// This function should only be called with managed va_blocks.
 //
-// va_block_context must NOT be NULL
+// va_block_context must not be NULL and va_block_context->policy must be valid.
+// See the comments for uvm_va_block_check_policy_is_valid().
 //
 // LOCKING: This takes and releases the VA block lock. If va_block_context->mm
 //          != NULL, va_block_context->mm->mmap_lock must be held in at least
@@ -871,6 +945,29 @@ NV_STATUS uvm_va_block_set_read_duplication(uvm_va_block_t *va_block,
 NV_STATUS uvm_va_block_unset_read_duplication(uvm_va_block_t *va_block,
                                               uvm_va_block_context_t *va_block_context);

+// Check if processor_id is allowed to access the va_block with access_type
+// permissions. Return values:
+//
+// NV_ERR_INVALID_ADDRESS       The VA block is logically dead (zombie)
+// NV_ERR_INVALID_ACCESS_TYPE   The vma corresponding to the VA range does not
+//                              allow access_type permissions, or migration is
+//                              disallowed and processor_id cannot access the
+//                              range remotely (UVM-Lite).
+// NV_ERR_INVALID_OPERATION     The access would violate the policies specified
+//                              by UvmPreventMigrationRangeGroups.
+//
+// va_block_context must not be NULL, va_block_context->policy must be valid,
+// and if the va_block is a HMM block, va_block_context->hmm.vma must be valid
+// which also means the va_block_context->mm is not NULL, retained, and locked
+// for at least read.
+// Locking: the va_block lock must be held.
+NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
+                                                 uvm_va_block_context_t *va_block_context,
+                                                 uvm_processor_id_t processor_id,
+                                                 uvm_page_index_t page_index,
+                                                 uvm_fault_type_t access_type,
+                                                 bool allow_migration);
+
 // API for access privilege revocation
 //
 // Revoke prot_to_revoke access permissions for the given processor.
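
A fault-servicing caller would typically treat any failure of this check as
making the access fatal for that processor; a minimal sketch (variable names
assumed):

    status = uvm_va_block_check_logical_permissions(va_block,
                                                    va_block_context,
                                                    processor_id,
                                                    page_index,
                                                    access_type,
                                                    allow_migration);
    // NV_ERR_INVALID_ACCESS_TYPE and NV_ERR_INVALID_OPERATION both mean the
    // access cannot be serviced by migration; NV_ERR_INVALID_ADDRESS means
    // the block is a zombie.
    is_fatal = (status != NV_OK);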
@@ -898,7 +995,7 @@ NV_STATUS uvm_va_block_unset_read_duplication(uvm_va_block_t *va_block,
 // different pages because the earlier operation can cause a PTE split or merge
 // which is assumed by the later operation.
 //
-// va_block_context must not be NULL.
+// va_block_context must not be NULL. The va_block_context->policy is unused.
 //
 // If allocation-retry was required as part of the operation and was successful,
 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
@@ -938,7 +1035,8 @@ NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block,
 // processor_id, which triggered the migration and should have already been
 // mapped).
 //
-// va_block_context must not be NULL.
+// va_block_context must not be NULL and va_block_context->policy must be valid.
+// See the comments for uvm_va_block_check_policy_is_valid().
 //
 // This function acquires/waits for the va_block tracker and updates that
 // tracker with any new work pushed.
@@ -968,7 +1066,8 @@ NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
 // Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
 // uvm_va_block_map() indicating that the operation needs to be retried.
 //
-// va_block_context must not be NULL.
+// va_block_context must not be NULL and va_block_context->policy must be valid.
+// See the comments for uvm_va_block_check_policy_is_valid().
 //
 // LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
 //          NULL, va_block_context->mm->mmap_lock must be held in at least read
@@ -989,6 +1088,8 @@ NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_spa
 // If mm != NULL, that mm is used for any CPU mappings which may be created as
 // a result of this call. See uvm_va_block_context_t::mm for details.
 //
+// va_block_context must not be NULL. The va_block_context->policy is unused.
+//
 // LOCKING: The caller must hold the va_block lock. If block_context->mm is not
 //          NULL, the caller must hold mm->mmap_lock in at least read mode.
 void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
@@ -1057,10 +1158,7 @@ NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block,
 // Exactly the same split semantics as uvm_va_block_split, including error
 // handling except the existing_va_block block lock needs to be held and
 // the new_va_block has to be preallocated.
-//
-// new_va_block's va_range is set to new_va_range before any reverse mapping is
-// established to the new block, but the caller is responsible for inserting the
-// new block into the range.
+// Also note that the existing_va_block lock may be dropped and re-acquired.
 NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block,
                                     NvU64 new_end,
                                     uvm_va_block_t *new_va_block,
@@ -1076,6 +1174,7 @@ NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block,
 //  - va_space lock must be held in at least read mode
 //
 // service_context->block_context.mm is ignored and vma->vm_mm is used instead.
+// service_context->block_context.policy is set by this function.
 //
 // Returns NV_ERR_INVALID_ACCESS_TYPE if a CPU mapping to fault_addr cannot be
 // accessed, for example because it's within a range group which is non-
@@ -1089,6 +1188,8 @@ NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block,
 // (migrations, cache invalidates, etc.) in response to the given service block
 // context
 //
+// service_context->block_context.policy is set by this function.
+//
 // Locking:
 //  - service_context->block_context.mm->mmap_lock must be held in at least
 //    read mode, if valid.
@@ -1132,10 +1233,18 @@ static inline NvU64 uvm_va_block_cpu_page_address(uvm_va_block_t *block, uvm_pag
     return block->start + PAGE_SIZE * page_index;
 }

+// Get the physical address on the given GPU for given residency
+uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_block,
+                                                          uvm_page_index_t page_index,
+                                                          uvm_processor_id_t residency,
+                                                          uvm_gpu_t *gpu);
+
 // Get the page physical address on the given GPU
 //
 // This will assert that GPU state is indeed present.
-uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_gpu_t *gpu);
+uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block,
+                                                          uvm_page_index_t page_index,
+                                                          uvm_gpu_t *gpu);

 static bool uvm_va_block_contains_address(uvm_va_block_t *block, NvU64 address)
 {
@@ -1191,26 +1300,28 @@ NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t

 // Same as uvm_va_block_find except that the block is created if not found.
 // If addr is covered by a UVM_VA_RANGE_TYPE_MANAGED va_range, a managed block
-// will be created. Otherwise, if addr is not covered by any va_range, mm is
-// non-NULL, and HMM is enabled in the va_space, an HMM block will be created.
-// In either case, if mm is non-NULL, it must be retained and locked in at
-// least read mode. Return values:
+// will be created. Otherwise, if addr is not covered by any va_range, HMM is
+// enabled in the va_space, and va_block_context and va_block_context->mm are
+// non-NULL, then a HMM block will be created and va_block_context->hmm.vma is
+// set to the VMA covering 'addr'. The va_block_context->policy field is left
+// unchanged.
+// In either case, if va_block_context->mm is non-NULL, it must be retained and
+// locked in at least read mode. Return values:
 // NV_ERR_INVALID_ADDRESS addr is not a UVM_VA_RANGE_TYPE_MANAGED va_range nor
 //                        a HMM enabled VMA.
 // NV_ERR_NO_MEMORY       memory could not be allocated.
 NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
-                                   struct mm_struct *mm,
                                    NvU64 addr,
                                    uvm_va_block_context_t *va_block_context,
                                    uvm_va_block_t **out_block);

-// Same as uvm_va_block_find_create except that only UVM managed va_blocks are
+// Same as uvm_va_block_find_create except that only managed va_blocks are
 // created if not already present in the VA range.
 static NV_STATUS uvm_va_block_find_create_managed(uvm_va_space_t *va_space,
                                                   NvU64 addr,
                                                   uvm_va_block_t **out_block)
 {
-    return uvm_va_block_find_create(va_space, NULL, addr, NULL, out_block);
+    return uvm_va_block_find_create(va_space, addr, NULL, out_block);
 }

 // Look up a chunk backing a specific address within the VA block. Returns NULL if none.
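
Under the new signature the mm travels inside va_block_context instead of being
a separate parameter; a sketch of a lookup (names assumed):

    uvm_va_block_t *va_block;
    NV_STATUS status = uvm_va_block_find_create(va_space,
                                                fault_addr,
                                                va_block_context,
                                                &va_block);

    // For a HMM block, the call is documented to also set
    // va_block_context->hmm.vma to the VMA covering fault_addr.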
@@ -1232,7 +1343,8 @@ typedef enum
 // The caller needs to handle allocation-retry. va_block_retry can be NULL if
 // the destination is the CPU.
 //
-// va_block_context must not be NULL.
+// va_block_context must not be NULL and va_block_context->policy must be valid.
+// See the comments for uvm_va_block_check_policy_is_valid().
 //
 // LOCKING: The caller must hold the va_block lock. If va_block_context->mm !=
 //          NULL, va_block_context->mm->mmap_lock must be held in at least
@@ -1249,6 +1361,9 @@ NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
 //
 // The [dst, dst + size) range has to fit within a single PAGE_SIZE page.
 //
+// va_block_context must not be NULL. The caller is not required to set
+// va_block_context->policy.
+//
 // The caller needs to support allocation-retry of page tables.
 //
 // LOCKING: The caller must hold the va_block lock
@@ -1317,6 +1432,8 @@ void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block);
 // successful, NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case the
 // block's lock was unlocked and relocked.
 //
+// va_block_context must not be NULL. The va_block_context->policy is unused.
+//
 // LOCKING: The caller must hold the va_block lock.
 NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu);

@@ -1396,6 +1513,26 @@ static uvm_va_block_region_t uvm_va_block_region_from_block(uvm_va_b
     return uvm_va_block_region(0, uvm_va_block_num_cpu_pages(va_block));
 }

+// Create a block region from a va block and page mask. Note that the region
+// covers the first through the last set bit and may have unset bits in between.
+static uvm_va_block_region_t uvm_va_block_region_from_mask(uvm_va_block_t *va_block, const uvm_page_mask_t *page_mask)
+{
+    uvm_va_block_region_t region;
+    uvm_page_index_t outer = uvm_va_block_num_cpu_pages(va_block);
+
+    region.first = find_first_bit(page_mask->bitmap, outer);
+    if (region.first >= outer) {
+        region = uvm_va_block_region(0, 0);
+    }
+    else {
+        // At least one bit is set so find_last_bit() should not return 'outer'.
+        region.outer = find_last_bit(page_mask->bitmap, outer) + 1;
+        UVM_ASSERT(region.outer <= outer);
+    }
+
+    return region;
+}
+
 static bool uvm_page_mask_test(const uvm_page_mask_t *mask, uvm_page_index_t page_index)
 {
     UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);
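
A typical use of uvm_va_block_region_from_mask() is to bound per-page loops to
the populated span of a mask; because the region may contain unset bits, each
page still has to be tested (illustrative snippet):

    uvm_va_block_region_t region = uvm_va_block_region_from_mask(va_block, page_mask);
    uvm_page_index_t page_index;

    for_each_va_block_page_in_region(page_index, region) {
        if (!uvm_page_mask_test(page_mask, page_index))
            continue; // region covers first..last set bit, holes included

        // ... operate on page_index ...
    }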
@@ -1715,61 +1852,6 @@ static NvU64 uvm_reverse_map_end(const uvm_reverse_map_t *reverse_map)
 #define for_each_va_block_page(page_index, va_block) \
     for_each_va_block_page_in_region((page_index), uvm_va_block_region_from_block(va_block))

-static void uvm_va_block_bitmap_tree_init_from_page_count(uvm_va_block_bitmap_tree_t *bitmap_tree, size_t page_count)
-{
-    bitmap_tree->leaf_count = page_count;
-    bitmap_tree->level_count = ilog2(roundup_pow_of_two(page_count)) + 1;
-    uvm_page_mask_zero(&bitmap_tree->pages);
-}
-
-static void uvm_va_block_bitmap_tree_init(uvm_va_block_bitmap_tree_t *bitmap_tree, uvm_va_block_t *va_block)
-{
-    size_t num_pages = uvm_va_block_num_cpu_pages(va_block);
-    uvm_va_block_bitmap_tree_init_from_page_count(bitmap_tree, num_pages);
-}
-
-static void uvm_va_block_bitmap_tree_iter_init(const uvm_va_block_bitmap_tree_t *bitmap_tree,
-                                               uvm_page_index_t page_index,
-                                               uvm_va_block_bitmap_tree_iter_t *iter)
-{
-    UVM_ASSERT(bitmap_tree->level_count > 0);
-    UVM_ASSERT_MSG(page_index < bitmap_tree->leaf_count,
-                   "%zd vs %zd",
-                   (size_t)page_index,
-                   (size_t)bitmap_tree->leaf_count);
-
-    iter->level_idx = bitmap_tree->level_count - 1;
-    iter->node_idx = page_index;
-}
-
-static uvm_va_block_region_t uvm_va_block_bitmap_tree_iter_get_range(const uvm_va_block_bitmap_tree_t *bitmap_tree,
-                                                                     const uvm_va_block_bitmap_tree_iter_t *iter)
-{
-    NvU16 range_leaves = uvm_perf_tree_iter_leaf_range(bitmap_tree, iter);
-    NvU16 range_start = uvm_perf_tree_iter_leaf_range_start(bitmap_tree, iter);
-    uvm_va_block_region_t subregion = uvm_va_block_region(range_start, range_start + range_leaves);
-
-    UVM_ASSERT(iter->level_idx >= 0);
-    UVM_ASSERT(iter->level_idx < bitmap_tree->level_count);
-
-    return subregion;
-}
-
-static NvU16 uvm_va_block_bitmap_tree_iter_get_count(const uvm_va_block_bitmap_tree_t *bitmap_tree,
-                                                     const uvm_va_block_bitmap_tree_iter_t *iter)
-{
-    uvm_va_block_region_t subregion = uvm_va_block_bitmap_tree_iter_get_range(bitmap_tree, iter);
-
-    return uvm_page_mask_region_weight(&bitmap_tree->pages, subregion);
-}
-
-#define uvm_va_block_bitmap_tree_traverse_counters(counter, tree, page, iter)     \
-    for (uvm_va_block_bitmap_tree_iter_init((tree), (page), (iter)),              \
-         (counter) = uvm_va_block_bitmap_tree_iter_get_count((tree), (iter));     \
-         (iter)->level_idx >= 0;                                                  \
-         (counter) = --(iter)->level_idx < 0 ? 0 :                                \
-                         uvm_va_block_bitmap_tree_iter_get_count((tree), (iter)))
-
 // Return the block region covered by the given chunk size. page_index must be
 // any page within the block known to be covered by the chunk.
 static uvm_va_block_region_t uvm_va_block_chunk_region(uvm_va_block_t *block,
@@ -1898,6 +1980,12 @@ uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block,
 // returned.
 uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size);

+// Returns the largest sub-region of 'region' which can fit big pages.
+// If the region cannot fit any big pages, an invalid region (0, 0) is returned.
+uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
+                                                          uvm_va_block_region_t region,
+                                                          NvU32 big_page_size);
+
 // Returns the big page index (the bit index within
 // uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
 // page_index cannot be covered by a big PTE due to alignment or block size,
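
For example, a caller mapping a range with mixed alignment might split the work
around the big-page-eligible interior (sketch; handling of the unaligned edges
is omitted):

    uvm_va_block_region_t big_region = uvm_va_block_big_page_region_subset(va_block,
                                                                           region,
                                                                           big_page_size);

    // An invalid region (0, 0) means no big pages fit anywhere in 'region'.
    if (big_region.outer > big_region.first) {
        // [big_region.first, big_region.outer) is aligned and sized so that
        // every big page inside it lies entirely within 'region'.
    }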
@@ -1907,7 +1995,14 @@ size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t pa
 // Returns the new residency for a page that faulted or triggered access
 // counter notifications. The read_duplicate output parameter indicates if the
 // page meets the requirements to be read-duplicated
+// va_block_context must not be NULL, va_block_context->policy must be valid,
+// and if the va_block is a HMM block, va_block_context->hmm.vma must be valid
+// which also means the va_block_context->mm is not NULL, retained, and locked
+// for at least read. See the comments for uvm_va_block_check_policy_is_valid()
+// and uvm_hmm_va_block_context_vma_is_valid() in uvm_hmm.h.
+// Locking: the va_block lock must be held.
 uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
+                                                 uvm_va_block_context_t *va_block_context,
                                                  uvm_page_index_t page_index,
                                                  uvm_processor_id_t processor_id,
                                                  NvU32 access_type_mask,