Remove debug prints and obsolete tests.

This commit is contained in:
Ville Pietilä
2025-10-07 13:49:18 +00:00
parent d9e9f19ca4
commit 438787dbb6
2 changed files with 0 additions and 160 deletions

View File

@@ -42,62 +42,6 @@ constexpr auto make_blocked_tensor_descriptor()
return desc_4d;
}
/// Prints the contents of `data` to stdout using hand-written index arithmetic.
///
/// The output is grouped by column index (0..Gm-1), then by row index (0..Gm-1),
/// then by sub-row (0..MWidth-1); each sub-row lists NWidth values.
///
/// @param data   Flat buffer that is read; not modified.
/// @param MWidth Number of sub-rows printed per row.
/// @param NWidth Number of values printed per sub-row.
/// @param Gm     Number of columns and rows iterated by the two outer loops.
void debug_print_explicit(const std::vector<int>& data, index_t MWidth, index_t NWidth, index_t Gm)
{
    // Flat position of one value: the column index has stride 1, the sub-row
    // stride Gm, the element stride Gm * MWidth and the row stride
    // Gm * MWidth * NWidth.
    const auto flat_offset = [&](int col, int row, int sub_row, int elem) -> int {
        return col + Gm * sub_row + Gm * MWidth * elem + Gm * MWidth * NWidth * row;
    };

    // With a single sub-row per row the values stay on one line.
    const bool newline_after_sub_row = MWidth > 1;

    std::cout << "Explicit Indexing:" << std::endl;
    for(int col = 0; col < Gm; ++col)
    {
        std::cout << "Col " << col << ": " << std::endl;
        for(int row = 0; row < Gm; ++row)
        {
            for(int sub_row = 0; sub_row < MWidth; ++sub_row)
            {
                std::cout << "Row " << row << " (sub-row " << sub_row << "): ";
                for(int elem = 0; elem < NWidth; ++elem)
                {
                    const int position = flat_offset(col, row, sub_row, elem);
                    std::cout << data[position] << " ";
                }
                if(newline_after_sub_row)
                {
                    std::cout << std::endl;
                }
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
void debug_print_tensor_desc(const std::vector<int>& data, const auto& desc)
{
const auto lengths = desc.get_lengths();
std::cout << "Using Tensor Descriptor:" << std::endl;
for(int c = 0; c < lengths[number<1>{}]; ++c)
{
std::cout << "Col " << c << ": " << std::endl;
for(int r = 0; r < lengths[number<0>{}]; ++r)
{
for(int m = 0; m < lengths[number<2>{}]; ++m)
{
std::cout << "Row " << r << " (sub-row " << m << "): ";
for(int n = 0; n < lengths[number<3>{}]; ++n)
{
const auto block_coord = make_tuple(r, c, m, n);
const auto idx = desc.calculate_offset(block_coord);
std::cout << data[idx] << " ";
}
if(lengths[number<2>{}] > 1)
{
std::cout << std::endl;
}
}
std::cout << std::endl;
}
std::cout << std::endl;
}
}
TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_1x4_blocks)
{
constexpr index_t MPerBlock = 2;
@@ -119,7 +63,6 @@ TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_1x4_blocks)
constexpr auto desc = make_blocked_tensor_descriptor<MPerBlock, NPerBlock, Gm>();
debug_print_explicit(data, MWidth, NWidth, Gm);
std::vector<int> data_explicit_indexing(MPerBlock * NPerBlock, -1);
for(int col = 0; col < Gm; ++col)
{
@@ -133,7 +76,6 @@ TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_1x4_blocks)
}
}
debug_print_tensor_desc(data, desc);
std::vector<int> data_tensor_desc(MPerBlock * NPerBlock, -1);
for(int col = 0; col < Gm; ++col)
{
@@ -154,73 +96,3 @@ TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_1x4_blocks)
// Verify both methods yield the same result
EXPECT_EQ(data_explicit_indexing, data_tensor_desc);
}
TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_2x4_blocks)
{
    // Print-only test: it dumps the same buffer once through explicit index
    // arithmetic and once through the blocked tensor descriptor. It makes no
    // assertions; the two dumps are meant to be compared by eye.
    constexpr index_t kRowsPerBlock = 4;
    constexpr index_t kColsPerBlock = 8;
    constexpr index_t kGroups       = 2;
    constexpr index_t kSubRows      = kRowsPerBlock / kGroups;
    constexpr index_t kSubCols      = kColsPerBlock / kGroups;

    // The buffer holds a 4x8 matrix split into a 2x2 grid of 2x4 blocks:
    //
    //   Block(0,0) | Block(0,1)
    //   ----------------------
    //   Block(1,0) | Block(1,1)
    // clang-format off
    const std::vector<int> matrix_values{
         0,  4,  8, 12,  1,  5,  9, 13,
         2,  6, 10, 14,  3,  7, 11, 15,
        16, 20, 24, 28, 17, 21, 25, 29,
        18, 22, 26, 30, 19, 23, 27, 31
    };
    // clang-format on

    constexpr auto blocked_desc =
        make_blocked_tensor_descriptor<kRowsPerBlock, kColsPerBlock, kGroups>();

    debug_print_explicit(matrix_values, kSubRows, kSubCols, kGroups);
    debug_print_tensor_desc(matrix_values, blocked_desc);
}
// Builds a tensor view over a 32-int buffer using the blocked descriptor, then
// reads two int4 vectors starting at the coordinate (1, 1, 0, 0) and checks
// each of their four lanes against fixed expected values.
TEST_F(TestTensorDescriptor, GetSubBlockWithVectorizedAccess)
{
constexpr index_t MPerBlock = 4;
constexpr index_t NPerBlock = 8;
constexpr index_t Gm = 2;
// This data represents a 4x8 matrix divided into 2x2 blocks of size 2x4 each
// Block structure:
// Block(0,0) | Block(0,1)
// ----------------------
// Block(1,0) | Block(1,1)
// clang-format off
std::vector<int> data_vec {
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
16, 20, 24, 28, 17, 21, 25, 29,
18, 22, 26, 30, 19, 23, 27, 31
};
// clang-format on
constexpr auto desc = make_blocked_tensor_descriptor<MPerBlock, NPerBlock, Gm>();
// The int buffer is handed to the view through an int4 pointer.
// NOTE(review): the descriptor is built from the 4x8 per-int dimensions while
// the view's element type is int4 -- confirm which unit the descriptor offsets
// are expressed in.
const auto tensor_view = make_tensor_view(reinterpret_cast<int4*>(data_vec.data()), desc);
// Starting coordinate (1, 1, 0, 0).
// NOTE(review): presumably (block row, block col, sub-row, element) -- confirm
// against make_blocked_tensor_descriptor.
const auto base_addr = make_multi_index(number<1>{}, number<1>{}, number<0>{}, number<0>{});
const auto block_offset = make_tensor_coordinate(desc, base_addr);
// First row of sub-block (1,1)
// NOTE(review): the second argument (0 here, 1 below) is presumably a linear
// offset applied on top of the coordinate -- confirm against the tensor_view API.
const auto row1 = tensor_view.get_vectorized_elements<int4>(block_offset, 0);
EXPECT_EQ(row1.x, 20);
EXPECT_EQ(row1.y, 21);
EXPECT_EQ(row1.z, 22);
EXPECT_EQ(row1.w, 23);
// Second row of sub-block (1,1)
const auto row2 = tensor_view.get_vectorized_elements<int4>(block_offset, 1);
EXPECT_EQ(row2.x, 28);
EXPECT_EQ(row2.y, 29);
EXPECT_EQ(row2.z, 30);
EXPECT_EQ(row2.w, 31);
}

View File

@@ -712,44 +712,12 @@ __global__ void test_4x4_matrix_get_2x2_blocks_with_sfc_and_lds_kernel(int* inpu
store_tile(in_lds_window, lds_tile);
block_sync_lds();
// Print the contents of LDS
if(threadIdx.x == 0 && blockIdx.x == 0)
{
printf("LDS contents:\n");
int* lds_data = reinterpret_cast<int*>(p_smem);
for(index_t i = 0; i < 4; i++)
{
for(index_t j = 0; j < 4; j++)
{
printf("%3d ", lds_data[i * 4 + j]);
}
printf("\n");
}
}
// For the output tensor, we need to copy only the diagonal 2x2 blocks to global memory.
static_for<0, NumGroupsToMerge, 1>{}([&](auto group) {
auto out_tensor = load_tile(make_tile_window(out_lds_window, output_tile_distribution));
store_tile(output_window, out_tensor);
// Print the output tensor contents.
__syncthreads();
if(threadIdx.x == 0 && blockIdx.x == 0)
{
printf("Output tensor contents after loading group %d:\n", group.value);
for(index_t i = 0; i < 4; i++)
{
for(index_t j = 0; j < 2; j++)
{
printf("%3d", output[i * 2 + j]);
}
printf("\n");
}
}
__syncthreads();
// Moving output window works correctly.
if constexpr(group != NumGroupsToMerge - 1)
{