535.43.24

2026-02-07 00:29:58 +00:00 · 2024-01-31 14:02:06 -08:00
parent 2a3b58b8c8
commit e558660fc2
267 changed files with 89045 additions and 82824 deletions
--- a/kernel-open/nvidia-uvm/uvm_page_tree_test.c
+++ b/kernel-open/nvidia-uvm/uvm_page_tree_test.c
@@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2015-2022 NVIDIA Corporation
+    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -146,9 +146,15 @@ static void fake_tlb_invals_disable(void)
    g_fake_tlb_invals_tracking_enabled = false;
 }

-// Fake TLB invalidate VA that just saves off the parameters so that they can be verified later
-static void fake_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_address_t pdb,
-        NvU32 depth, NvU64 base, NvU64 size, NvU32 page_size, uvm_membar_t membar)
+// Fake TLB invalidate VA that just saves off the parameters so that they can be
+// verified later.
+static void fake_tlb_invalidate_va(uvm_push_t *push,
+                                   uvm_gpu_phys_address_t pdb,
+                                   NvU32 depth,
+                                   NvU64 base,
+                                   NvU64 size,
+                                   NvU32 page_size,
+                                   uvm_membar_t membar)
 {
    if (!g_fake_tlb_invals_tracking_enabled)
        return;
@@ -210,8 +216,8 @@ static bool assert_and_reset_last_invalidate(NvU32 expected_depth, bool expected
    }
    if ((g_last_fake_inval->membar == UVM_MEMBAR_NONE) == expected_membar) {
        UVM_TEST_PRINT("Expected %s membar, got %s instead\n",
-                expected_membar ? "a" : "no",
-                uvm_membar_string(g_last_fake_inval->membar));
+                       expected_membar ? "a" : "no",
+                       uvm_membar_string(g_last_fake_inval->membar));
        result = false;
    }

@@ -230,7 +236,8 @@ static bool assert_last_invalidate_all(NvU32 expected_depth, bool expected_memba
    }
    if (g_last_fake_inval->base != 0 || g_last_fake_inval->size != -1) {
        UVM_TEST_PRINT("Expected invalidate all but got range [0x%llx, 0x%llx) instead\n",
-                g_last_fake_inval->base, g_last_fake_inval->base + g_last_fake_inval->size);
+                       g_last_fake_inval->base,
+                       g_last_fake_inval->base + g_last_fake_inval->size);
        return false;
    }
    if (g_last_fake_inval->depth != expected_depth) {
@@ -247,15 +254,16 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
    UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);

    if (g_fake_invals_count == 0) {
-        UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n",
-                base, base + size);
+        UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n", base, base + size);
        return false;
    }

    if ((inval->base != base || inval->size != size) && inval->base != 0 && inval->size != -1) {
        UVM_TEST_PRINT("Expected invalidate range [0x%llx, 0x%llx), but got range [0x%llx, 0x%llx) instead\n",
-                base, base + size,
-                inval->base, inval->base + inval->size);
+                        base,
+                        base + size,
+                        inval->base,
+                        inval->base + inval->size);
        return false;
    }
    if (inval->depth != expected_depth) {
@@ -270,7 +278,13 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
    return true;
 }

-static bool assert_invalidate_range(NvU64 base, NvU64 size, NvU32 page_size, bool allow_inval_all, NvU32 range_depth, NvU32 all_depth, bool expected_membar)
+static bool assert_invalidate_range(NvU64 base,
+                                    NvU64 size,
+                                    NvU32 page_size,
+                                    bool allow_inval_all,
+                                    NvU32 range_depth,
+                                    NvU32 all_depth,
+                                    bool expected_membar)
 {
    NvU32 i;

@@ -488,7 +502,6 @@ static NV_STATUS alloc_adjacent_pde_64k_memory(uvm_gpu_t *gpu)
    return NV_OK;
 }

-
 static NV_STATUS alloc_nearby_pde_64k_memory(uvm_gpu_t *gpu)
 {
    uvm_page_tree_t tree;
@@ -842,6 +855,7 @@ static NV_STATUS get_two_free_apart(uvm_gpu_t *gpu)
    TEST_CHECK_RET(range2.entry_count == 256);
    TEST_CHECK_RET(range2.table->ref_count == 512);
    TEST_CHECK_RET(range1.table == range2.table);
+
    // 4k page is second entry in a dual PDE
    TEST_CHECK_RET(range1.table == tree.root->entries[0]->entries[0]->entries[0]->entries[1]);
    TEST_CHECK_RET(range1.start_index == 256);
@@ -871,6 +885,7 @@ static NV_STATUS get_overlapping_dual_pdes(uvm_gpu_t *gpu)
    MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, size, size, &range64k), NV_OK);
    TEST_CHECK_RET(range64k.entry_count == 16);
    TEST_CHECK_RET(range64k.table->ref_count == 16);
+
    // 4k page is second entry in a dual PDE
    TEST_CHECK_RET(range64k.table == tree.root->entries[0]->entries[0]->entries[0]->entries[0]);
    TEST_CHECK_RET(range64k.start_index == 16);
@@ -1030,10 +1045,13 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)

    // Depth 4
    NvU64 extent_pte = UVM_PAGE_SIZE_2M;
+
    // Depth 3
    NvU64 extent_pde0 = extent_pte * (1ull << 8);
+
    // Depth 2
    NvU64 extent_pde1 = extent_pde0 * (1ull << 9);
+
    // Depth 1
    NvU64 extent_pde2 = extent_pde1 * (1ull << 9);

@@ -1081,7 +1099,11 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
    return status;
 }

-static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree, NvU64 base, NvU64 size, NvU32 min_page_size, NvU32 max_page_size)
+static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
+                                                 NvU64 base,
+                                                 NvU64 size,
+                                                 NvU32 min_page_size,
+                                                 NvU32 max_page_size)
 {
    NV_STATUS status = NV_OK;
    uvm_push_t push;
@@ -1205,7 +1227,11 @@ static bool assert_range_vec_ptes(uvm_page_table_range_vec_t *range_vec, bool ex
            NvU64 expected_pte = expecting_cleared ? 0 : range_vec->size + offset;
            if (*pte != expected_pte) {
                UVM_TEST_PRINT("PTE is 0x%llx instead of 0x%llx for offset 0x%llx within range [0x%llx, 0x%llx)\n",
-                        *pte, expected_pte, offset, range_vec->start, range_vec->size);
+                               *pte,
+                               expected_pte,
+                               offset,
+                               range_vec->start,
+                               range_vec->size);
                return false;
            }
            offset += range_vec->page_size;
@@ -1226,7 +1252,11 @@ static NV_STATUS test_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec
    TEST_CHECK_RET(data.status == NV_OK);
    TEST_CHECK_RET(data.count == range_vec->size / range_vec->page_size);
    TEST_CHECK_RET(assert_invalidate_range_specific(g_last_fake_inval,
-            range_vec->start, range_vec->size, range_vec->page_size, page_table_depth, membar != UVM_MEMBAR_NONE));
+                                                    range_vec->start,
+                                                    range_vec->size,
+                                                    range_vec->page_size,
+                                                    page_table_depth,
+                                                    membar != UVM_MEMBAR_NONE));
    TEST_CHECK_RET(assert_range_vec_ptes(range_vec, false));

    fake_tlb_invals_disable();
@@ -1249,7 +1279,11 @@ static NV_STATUS test_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec
    return NV_OK;
 }

-static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree, NvU64 start, NvU64 size, NvU32 page_size, uvm_page_table_range_vec_t **range_vec_out)
+static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
+                                       NvU64 start,
+                                       NvU64 size,
+                                       NvU32 page_size,
+                                       uvm_page_table_range_vec_t **range_vec_out)
 {
    uvm_page_table_range_vec_t *range_vec;
    uvm_pmm_alloc_flags_t pmm_flags = UVM_PMM_ALLOC_FLAGS_EVICT;
@@ -1544,25 +1578,28 @@ static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
    uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999000LL);
    uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
    uvm_mmu_mode_hal_t *hal;
+    uvm_page_directory_t dir;
    NvU32 i, j, big_page_size, page_size;

+    dir.depth = 0;
+
    for (i = 0; i < ARRAY_SIZE(big_page_sizes); i++) {
        big_page_size = big_page_sizes[i];
        hal = gpu->parent->arch_hal->mmu_mode_hal(big_page_size);

        memset(phys_allocs, 0, sizeof(phys_allocs));

-        hal->make_pde(&pde_bits, phys_allocs, 0);
+        hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
        TEST_CHECK_RET(pde_bits == 0x0L);

        phys_allocs[0] = &alloc_sys;
        phys_allocs[1] = &alloc_vid;
-        hal->make_pde(&pde_bits, phys_allocs, 0);
+        hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
        TEST_CHECK_RET(pde_bits == 0x1BBBBBBD99999992LL);

        phys_allocs[0] = &alloc_vid;
        phys_allocs[1] = &alloc_sys;
-        hal->make_pde(&pde_bits, phys_allocs, 0);
+        hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
        TEST_CHECK_RET(pde_bits == 0x9999999E1BBBBBB1LL);

        for (j = 0; j <= 2; j++) {
@@ -1632,38 +1669,47 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
    uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
    uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
    uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
+    uvm_page_directory_t dir;
+
    // big versions have [11:8] set as well to test the page table merging
    uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
    uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBBB00LL);

    uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);

+    dir.index_in_parent = 0;
+    dir.host_parent = NULL;
+    dir.depth = 0;
+
    // Make sure cleared PDEs work as expected
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0);

    memset(pde_bits, 0xFF, sizeof(pde_bits));
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    dir.depth = 3;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);

    // Sys and vidmem PDEs
    phys_allocs[0] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    dir.depth = 0;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);

    phys_allocs[0] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);

    // Dual PDEs
    phys_allocs[0] = &alloc_big_sys;
    phys_allocs[1] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    dir.depth = 3;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);

    phys_allocs[0] = &alloc_big_vid;
    phys_allocs[1] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);

    // uncached, i.e., the sysmem data is not cached in GPU's L2 cache. Clear
@@ -1719,6 +1765,7 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr
    uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
    uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
    uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
+    uvm_page_directory_t dir;

    // big versions have [11:8] set as well to test the page table merging
    uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
@@ -1726,37 +1773,45 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr

    uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);

+    dir.index_in_parent = 0;
+    dir.host_parent = NULL;
+    dir.depth = 0;
+
    // Make sure cleared PDEs work as expected
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0);

    memset(pde_bits, 0xFF, sizeof(pde_bits));
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    dir.depth = 3;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);

    // Sys and vidmem PDEs
    phys_allocs[0] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    dir.depth = 0;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);

    phys_allocs[0] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);

    // Dual PDEs
    phys_allocs[0] = &alloc_big_sys;
    phys_allocs[1] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    dir.depth = 3;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);

    phys_allocs[0] = &alloc_big_vid;
    phys_allocs[1] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);

    // NO_ATS PDE1 (depth 2)
    phys_allocs[0] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 2);
+    dir.depth = 2;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    if (g_uvm_global.ats.enabled)
        TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB2A);
    else
@@ -1791,104 +1846,203 @@ static NV_STATUS entry_test_ampere(uvm_gpu_t *gpu, entry_test_page_size_func ent

 static NV_STATUS entry_test_hopper(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
 {
+    NV_STATUS status = NV_OK;
    NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
    NvU64 pde_bits[2];
+    uvm_page_directory_t *dirs[5];
    size_t i, num_page_sizes;
    uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
    uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999999000LL);
    uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0xBBBBBBB000LL);

-    // big versions have [11:8] set as well to test the page table merging
+    // Big versions have [11:8] set as well to test the page table merging
    uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999999900LL);
    uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0xBBBBBBBB00LL);

    uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);

-    // Make sure cleared PDEs work as expected
-    hal->make_pde(pde_bits, phys_allocs, 0);
-    TEST_CHECK_RET(pde_bits[0] == 0);
+    memset(dirs, 0, sizeof(dirs));
+    // Fake directory tree.
+    for (i = 0; i < ARRAY_SIZE(dirs); i++) {
+        dirs[i] = uvm_kvmalloc_zero(sizeof(uvm_page_directory_t) + sizeof(dirs[i]->entries[0]) * 512);
+        TEST_CHECK_GOTO(dirs[i] != NULL, cleanup);
+
+        dirs[i]->depth = i;
+        dirs[i]->index_in_parent = 0;
+
+        if (i == 0)
+            dirs[i]->host_parent = NULL;
+        else
+            dirs[i]->host_parent = dirs[i - 1];
+    }
+
+    // Make sure cleared PDEs work as expected.
+    hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
+    TEST_CHECK_GOTO(pde_bits[0] == 0, cleanup);

    // Cleared PDEs work as expected for big and small PDEs.
    memset(pde_bits, 0xFF, sizeof(pde_bits));
-    hal->make_pde(pde_bits, phys_allocs, 4);
-    TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
+    hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
+    TEST_CHECK_GOTO(pde_bits[0] == 0 && pde_bits[1] == 0, cleanup);

    // Sys and vidmem PDEs, uncached ATS allowed.
    phys_allocs[0] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 0);
-    TEST_CHECK_RET(pde_bits[0] == 0x999999999900C);
+    hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
+    TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);

    phys_allocs[0] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 0);
-    TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBB00A);
+    hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
+    TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBB00A, cleanup);

-    // Dual PDEs, uncached.
+    // Dual PDEs, uncached. We don't use child_dir in the depth 4 checks because
+    // our policy decides the PDE's PCF without using it.
    phys_allocs[0] = &alloc_big_sys;
    phys_allocs[1] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 4);
-    TEST_CHECK_RET(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A);
+    hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
+    if (g_uvm_global.ats.enabled)
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A, cleanup);
+    else
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999990C && pde_bits[1] == 0xBBBBBBB00A, cleanup);

    phys_allocs[0] = &alloc_big_vid;
    phys_allocs[1] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 4);
-    TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C);
+    hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
+    if (g_uvm_global.ats.enabled)
+        TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C, cleanup);
+    else
+        TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBBB0A && pde_bits[1] == 0x999999999900C, cleanup);
+
+    // We only need to test make_pde() on ATS when the CPU VA width < GPU's.
+    if (g_uvm_global.ats.enabled && uvm_cpu_num_va_bits() < hal->num_va_bits()) {
+        phys_allocs[0] = &alloc_sys;
+
+        dirs[1]->index_in_parent = 0;
+        hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
+
+        dirs[2]->index_in_parent = 0;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        dirs[2]->index_in_parent = 1;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 1);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        dirs[2]->index_in_parent = 2;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 2);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        dirs[2]->index_in_parent = 511;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 511);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        dirs[1]->index_in_parent = 1;
+        hal->make_pde(pde_bits, phys_allocs, dirs[0], 1);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
+
+        dirs[2]->index_in_parent = 0;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        dirs[2]->index_in_parent = 509;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 509);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        dirs[2]->index_in_parent = 510;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 510);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        phys_allocs[0] = NULL;
+
+        dirs[1]->index_in_parent = 0;
+        hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
+
+        dirs[2]->index_in_parent = 0;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
+
+        dirs[2]->index_in_parent = 2;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 2);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x10, cleanup);
+
+        dirs[1]->index_in_parent = 1;
+        dirs[2]->index_in_parent = 509;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 509);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x10, cleanup);
+
+        dirs[2]->index_in_parent = 510;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 510);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
+    }

    // uncached, i.e., the sysmem data is not cached in GPU's L2 cache, and
    // access counters disabled.
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
-                                 0x9999999999000LL,
-                                 UVM_PROT_READ_WRITE_ATOMIC,
-                                 UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) == 0x999999999968D);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
+                                  0x9999999999000LL,
+                                  UVM_PROT_READ_WRITE_ATOMIC,
+                                  UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) == 0x999999999968D,
+                    cleanup);

    // change to cached.
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
-                                 0x9999999999000LL,
-                                 UVM_PROT_READ_WRITE_ATOMIC,
-                                 UVM_MMU_PTE_FLAGS_CACHED | UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) ==
-                   0x9999999999685);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
+                                  0x9999999999000LL,
+                                  UVM_PROT_READ_WRITE_ATOMIC,
+                                  UVM_MMU_PTE_FLAGS_CACHED | UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) ==
+                                  0x9999999999685,
+                    cleanup);

    // enable access counters.
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
-                                 0x9999999999000LL,
-                                 UVM_PROT_READ_WRITE_ATOMIC,
-                                 UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999605);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
+                                  0x9999999999000LL,
+                                  UVM_PROT_READ_WRITE_ATOMIC,
+                                  UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999605,
+                    cleanup);

    // remove atomic
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
-                                 0x9999999999000LL,
-                                 UVM_PROT_READ_WRITE,
-                                 UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999645);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
+                                  0x9999999999000LL,
+                                  UVM_PROT_READ_WRITE,
+                                  UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999645,
+                    cleanup);

    // read only
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
-                                 0x9999999999000LL,
-                                 UVM_PROT_READ_ONLY,
-                                 UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999665);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
+                                  0x9999999999000LL,
+                                  UVM_PROT_READ_ONLY,
+                                  UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999665,
+                    cleanup);

    // local video
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_VID,
-                                 0xBBBBBBB000LL,
-                                 UVM_PROT_READ_ONLY,
-                                 UVM_MMU_PTE_FLAGS_CACHED) == 0xBBBBBBB661);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_VID,
+                                  0xBBBBBBB000LL,
+                                  UVM_PROT_READ_ONLY,
+                                  UVM_MMU_PTE_FLAGS_CACHED) == 0xBBBBBBB661,
+                    cleanup);

    // peer 1
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_PEER_1,
-                                 0xBBBBBBB000LL,
-                                 UVM_PROT_READ_ONLY,
-                                 UVM_MMU_PTE_FLAGS_CACHED) == 0x200000BBBBBBB663);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_PEER_1,
+                                  0xBBBBBBB000LL,
+                                  UVM_PROT_READ_ONLY,
+                                  UVM_MMU_PTE_FLAGS_CACHED) == 0x200000BBBBBBB663,
+                    cleanup);

    // sparse
-    TEST_CHECK_RET(hal->make_sparse_pte() == 0x8);
+    TEST_CHECK_GOTO(hal->make_sparse_pte() == 0x8, cleanup);

    // sked reflected
-    TEST_CHECK_RET(hal->make_sked_reflected_pte() == 0xF09);
+    TEST_CHECK_GOTO(hal->make_sked_reflected_pte() == 0xF09, cleanup);

    num_page_sizes = get_page_sizes(gpu, page_sizes);

    for (i = 0; i < num_page_sizes; i++)
-        TEST_NV_CHECK_RET(entry_test_page_size(gpu, page_sizes[i]));
+        TEST_NV_CHECK_GOTO(entry_test_page_size(gpu, page_sizes[i]), cleanup);

-    return NV_OK;
+cleanup:
+    for (i = 0; i < ARRAY_SIZE(dirs); i++)
+        uvm_kvfree(dirs[i]);
+
+    return status;
 }

 static NV_STATUS alloc_4k_maxwell(uvm_gpu_t *gpu)
@@ -2303,7 +2457,8 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
    gpu->parent = parent_gpu;

    // At least test_tlb_invalidates() relies on global state
-    // (g_tlb_invalidate_*) so make sure only one test instance can run at a time.
+    // (g_tlb_invalidate_*) so make sure only one test instance can run at a
+    // time.
    uvm_mutex_lock(&g_uvm_global.global_lock);

    // Allocate the fake TLB tracking state. Notably tests still need to enable
@@ -2311,7 +2466,13 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
    // calls.
    TEST_NV_CHECK_GOTO(fake_tlb_invals_alloc(), done);

-    TEST_NV_CHECK_GOTO(maxwell_test_page_tree(gpu), done);
+    // We prevent the maxwell_test_page_tree test from running on ATS-enabled
+    // systems. On "fake" Maxwell-based ATS systems pde_fill() may push more
+    // methods than what we support in UVM. Specifically, on
+    // uvm_page_tree_init() which eventually calls phys_mem_init(). On Maxwell,
+    // upper PDE levels have more than 512 entries.
+    if (!g_uvm_global.ats.enabled)
+        TEST_NV_CHECK_GOTO(maxwell_test_page_tree(gpu), done);
    TEST_NV_CHECK_GOTO(pascal_test_page_tree(gpu), done);
    TEST_NV_CHECK_GOTO(volta_test_page_tree(gpu), done);
    TEST_NV_CHECK_GOTO(ampere_test_page_tree(gpu), done);