diff --git a/test/ck_tile/tensor_descriptor/test_tensor_descriptor.cpp b/test/ck_tile/tensor_descriptor/test_tensor_descriptor.cpp index 395eeb2e40..f65d161bcc 100644 --- a/test/ck_tile/tensor_descriptor/test_tensor_descriptor.cpp +++ b/test/ck_tile/tensor_descriptor/test_tensor_descriptor.cpp @@ -42,62 +42,6 @@ constexpr auto make_blocked_tensor_descriptor() return desc_4d; } -void debug_print_explicit(const std::vector& data, index_t MWidth, index_t NWidth, index_t Gm) -{ - std::cout << "Explicit Indexing:" << std::endl; - for(int c = 0; c < Gm; ++c) - { - std::cout << "Col " << c << ": " << std::endl; - for(int r = 0; r < Gm; ++r) - { - for(int m = 0; m < MWidth; ++m) - { - std::cout << "Row " << r << " (sub-row " << m << "): "; - for(int n = 0; n < NWidth; ++n) - { - int idx = c + Gm * m + Gm * MWidth * n + Gm * MWidth * NWidth * r; - std::cout << data[idx] << " "; - } - if(MWidth > 1) - { - std::cout << std::endl; - } - } - std::cout << std::endl; - } - std::cout << std::endl; - } -} - -void debug_print_tensor_desc(const std::vector& data, const auto& desc) -{ - const auto lengths = desc.get_lengths(); - std::cout << "Using Tensor Descriptor:" << std::endl; - for(int c = 0; c < lengths[number<1>{}]; ++c) - { - std::cout << "Col " << c << ": " << std::endl; - for(int r = 0; r < lengths[number<0>{}]; ++r) - { - for(int m = 0; m < lengths[number<2>{}]; ++m) - { - std::cout << "Row " << r << " (sub-row " << m << "): "; - for(int n = 0; n < lengths[number<3>{}]; ++n) - { - const auto block_coord = make_tuple(r, c, m, n); - const auto idx = desc.calculate_offset(block_coord); - std::cout << data[idx] << " "; - } - if(lengths[number<2>{}] > 1) - { - std::cout << std::endl; - } - } - std::cout << std::endl; - } - std::cout << std::endl; - } -} - TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_1x4_blocks) { constexpr index_t MPerBlock = 2; @@ -119,7 +63,6 @@ TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_1x4_blocks) constexpr auto desc = make_blocked_tensor_descriptor(); - debug_print_explicit(data, MWidth, NWidth, Gm); std::vector data_explicit_indexing(MPerBlock * NPerBlock, -1); for(int col = 0; col < Gm; ++col) { @@ -133,7 +76,6 @@ TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_1x4_blocks) } } - debug_print_tensor_desc(data, desc); std::vector data_tensor_desc(MPerBlock * NPerBlock, -1); for(int col = 0; col < Gm; ++col) { @@ -154,73 +96,3 @@ TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_1x4_blocks) // Verify both methods yield the same result EXPECT_EQ(data_explicit_indexing, data_tensor_desc); } - -TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_2x4_blocks) -{ - constexpr index_t MPerBlock = 4; - constexpr index_t NPerBlock = 8; - constexpr index_t Gm = 2; - constexpr index_t MWidth = MPerBlock / Gm; - constexpr index_t NWidth = NPerBlock / Gm; - - // This data represents a 4x8 matrix divided into 2x2 blocks of size 2x4 each - // Block structure: - // Block(0,0) | Block(0,1) - // ---------------------- - // Block(1,0) | Block(1,1) - // clang-format off - std::vector data { - 0, 4, 8, 12, 1, 5, 9, 13, - 2, 6, 10, 14, 3, 7, 11, 15, - 16, 20, 24, 28, 17, 21, 25, 29, - 18, 22, 26, 30, 19, 23, 27, 31 - }; - // clang-format on - - constexpr auto desc = make_blocked_tensor_descriptor(); - - debug_print_explicit(data, MWidth, NWidth, Gm); - debug_print_tensor_desc(data, desc); -} - -TEST_F(TestTensorDescriptor, GetSubBlockWithVectorizedAccess) -{ - constexpr index_t MPerBlock = 4; - constexpr index_t NPerBlock = 8; - constexpr index_t Gm = 2; - - // This data represents a 4x8 matrix divided into 2x2 blocks of size 2x4 each - // Block structure: - // Block(0,0) | Block(0,1) - // ---------------------- - // Block(1,0) | Block(1,1) - // clang-format off - std::vector data_vec { - 0, 4, 8, 12, 1, 5, 9, 13, - 2, 6, 10, 14, 3, 7, 11, 15, - 16, 20, 24, 28, 17, 21, 25, 29, - 18, 22, 26, 30, 19, 23, 27, 31 - }; - // clang-format on - - constexpr auto desc = make_blocked_tensor_descriptor(); - - const auto tensor_view = make_tensor_view(reinterpret_cast(data_vec.data()), desc); - - const auto base_addr = make_multi_index(number<1>{}, number<1>{}, number<0>{}, number<0>{}); - const auto block_offset = make_tensor_coordinate(desc, base_addr); - - // First row of sub-block (1,1) - const auto row1 = tensor_view.get_vectorized_elements(block_offset, 0); - EXPECT_EQ(row1.x, 20); - EXPECT_EQ(row1.y, 21); - EXPECT_EQ(row1.z, 22); - EXPECT_EQ(row1.w, 23); - - // Second row of sub-block (1,1) - const auto row2 = tensor_view.get_vectorized_elements(block_offset, 1); - EXPECT_EQ(row2.x, 28); - EXPECT_EQ(row2.y, 29); - EXPECT_EQ(row2.z, 30); - EXPECT_EQ(row2.w, 31); -} diff --git a/test/ck_tile/tensor_view/test_tensor_view.cpp b/test/ck_tile/tensor_view/test_tensor_view.cpp index 48a91217f2..21a4adc829 100644 --- a/test/ck_tile/tensor_view/test_tensor_view.cpp +++ b/test/ck_tile/tensor_view/test_tensor_view.cpp @@ -712,44 +712,12 @@ __global__ void test_4x4_matrix_get_2x2_blocks_with_sfc_and_lds_kernel(int* inpu store_tile(in_lds_window, lds_tile); block_sync_lds(); - // Print the contents of LDS - if(threadIdx.x == 0 && blockIdx.x == 0) - { - printf("LDS contents:\n"); - int* lds_data = reinterpret_cast(p_smem); - for(index_t i = 0; i < 4; i++) - { - for(index_t j = 0; j < 4; j++) - { - printf("%3d ", lds_data[i * 4 + j]); - } - printf("\n"); - } - } - // For the output tensor, we need to copy only the diagonal 2x2 blocks to global memory. static_for<0, NumGroupsToMerge, 1>{}([&](auto group) { auto out_tensor = load_tile(make_tile_window(out_lds_window, output_tile_distribution)); store_tile(output_window, out_tensor); - // Print the output tensor contents. - __syncthreads(); - if(threadIdx.x == 0 && blockIdx.x == 0) - { - - printf("Output tensor contents after loading group %d:\n", group.value); - for(index_t i = 0; i < 4; i++) - { - for(index_t j = 0; j < 2; j++) - { - printf("%3d", output[i * 2 + j]); - } - printf("\n"); - } - } - __syncthreads(); - // Moving output window works correctly. if constexpr(group != NumGroupsToMerge - 1) {