mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-07-01 12:17:00 +00:00
Remove debug prints and obsolete tests.
This commit is contained in:
@@ -42,62 +42,6 @@ constexpr auto make_blocked_tensor_descriptor()
|
||||
return desc_4d;
|
||||
}
|
||||
|
||||
// Debug helper: dumps `data` to stdout using hand-computed flat indices.
//
// For every (c, r) pair in [0, Gm) x [0, Gm) it prints MWidth lines ("sub-rows") of NWidth
// values each. The value at coordinate (c, m, n, r) is read from
//     data[c + Gm * m + Gm * MWidth * n + Gm * MWidth * NWidth * r].
// No bounds checking is performed on `data`; the caller must supply enough elements.
void debug_print_explicit(const std::vector<int>& data, index_t MWidth, index_t NWidth, index_t Gm)
{
    // Strides of the m, n and r coordinates in the flat buffer (the c coordinate has stride 1).
    const auto m_stride = Gm;
    const auto n_stride = Gm * MWidth;
    const auto r_stride = Gm * MWidth * NWidth;

    // A line break after each sub-row is only emitted when there is more than one sub-row.
    const bool break_after_sub_row = MWidth > 1;

    std::cout << "Explicit Indexing:" << std::endl;
    for(int col = 0; col < Gm; ++col)
    {
        std::cout << "Col " << col << ": " << std::endl;
        for(int row = 0; row < Gm; ++row)
        {
            for(int sub_row = 0; sub_row < MWidth; ++sub_row)
            {
                std::cout << "Row " << row << " (sub-row " << sub_row << "): ";

                // Part of the flat index that does not depend on the innermost coordinate.
                const auto line_base = col + m_stride * sub_row + r_stride * row;
                for(int sub_col = 0; sub_col < NWidth; ++sub_col)
                {
                    const int flat = line_base + n_stride * sub_col;
                    std::cout << data[flat] << " ";
                }

                if(break_after_sub_row)
                {
                    std::cout << std::endl;
                }
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
|
||||
|
||||
void debug_print_tensor_desc(const std::vector<int>& data, const auto& desc)
|
||||
{
|
||||
const auto lengths = desc.get_lengths();
|
||||
std::cout << "Using Tensor Descriptor:" << std::endl;
|
||||
for(int c = 0; c < lengths[number<1>{}]; ++c)
|
||||
{
|
||||
std::cout << "Col " << c << ": " << std::endl;
|
||||
for(int r = 0; r < lengths[number<0>{}]; ++r)
|
||||
{
|
||||
for(int m = 0; m < lengths[number<2>{}]; ++m)
|
||||
{
|
||||
std::cout << "Row " << r << " (sub-row " << m << "): ";
|
||||
for(int n = 0; n < lengths[number<3>{}]; ++n)
|
||||
{
|
||||
const auto block_coord = make_tuple(r, c, m, n);
|
||||
const auto idx = desc.calculate_offset(block_coord);
|
||||
std::cout << data[idx] << " ";
|
||||
}
|
||||
if(lengths[number<2>{}] > 1)
|
||||
{
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_1x4_blocks)
|
||||
{
|
||||
constexpr index_t MPerBlock = 2;
|
||||
@@ -119,7 +63,6 @@ TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_1x4_blocks)
|
||||
|
||||
constexpr auto desc = make_blocked_tensor_descriptor<MPerBlock, NPerBlock, Gm>();
|
||||
|
||||
debug_print_explicit(data, MWidth, NWidth, Gm);
|
||||
std::vector<int> data_explicit_indexing(MPerBlock * NPerBlock, -1);
|
||||
for(int col = 0; col < Gm; ++col)
|
||||
{
|
||||
@@ -133,7 +76,6 @@ TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_1x4_blocks)
|
||||
}
|
||||
}
|
||||
|
||||
debug_print_tensor_desc(data, desc);
|
||||
std::vector<int> data_tensor_desc(MPerBlock * NPerBlock, -1);
|
||||
for(int col = 0; col < Gm; ++col)
|
||||
{
|
||||
@@ -154,73 +96,3 @@ TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_1x4_blocks)
|
||||
// Verify both methods yield the same result
|
||||
EXPECT_EQ(data_explicit_indexing, data_tensor_desc);
|
||||
}
|
||||
|
||||
// Prints the same 32-element buffer through both debug helpers: once with explicit index
// arithmetic and once through the blocked tensor descriptor. The test makes no assertions;
// its only effect is the text written to stdout.
TEST_F(TestTensorDescriptor, RowMajorBlocksWithColumnMajorData_2x4_blocks)
{
    constexpr index_t Gm        = 2;
    constexpr index_t MPerBlock = 4;
    constexpr index_t NPerBlock = 8;

    // Per-block extents along M and N.
    constexpr index_t MWidth = MPerBlock / Gm;
    constexpr index_t NWidth = NPerBlock / Gm;

    // The buffer holds a 4x8 matrix split into a 2x2 grid of blocks, each 2x4 elements:
    //
    //   Block(0,0) | Block(0,1)
    //   ----------------------
    //   Block(1,0) | Block(1,1)
    //
    // clang-format off
    const std::vector<int> data = {
         0,  4,  8, 12,  1,  5,  9, 13,
         2,  6, 10, 14,  3,  7, 11, 15,
        16, 20, 24, 28, 17, 21, 25, 29,
        18, 22, 26, 30, 19, 23, 27, 31
    };
    // clang-format on

    constexpr auto desc = make_blocked_tensor_descriptor<MPerBlock, NPerBlock, Gm>();

    debug_print_explicit(data, MWidth, NWidth, Gm);
    debug_print_tensor_desc(data, desc);
}
|
||||
|
||||
// Builds a tensor view over a 32-element buffer using the blocked descriptor and reads two
// int4 vectors starting at coordinate (1, 1, 0, 0), checking each lane against fixed values.
TEST_F(TestTensorDescriptor, GetSubBlockWithVectorizedAccess)
{
    constexpr index_t Gm        = 2;
    constexpr index_t MPerBlock = 4;
    constexpr index_t NPerBlock = 8;

    // The buffer holds a 4x8 matrix split into a 2x2 grid of blocks, each 2x4 elements:
    //
    //   Block(0,0) | Block(0,1)
    //   ----------------------
    //   Block(1,0) | Block(1,1)
    //
    // clang-format off
    std::vector<int> matrix = {
         0,  4,  8, 12,  1,  5,  9, 13,
         2,  6, 10, 14,  3,  7, 11, 15,
        16, 20, 24, 28, 17, 21, 25, 29,
        18, 22, 26, 30, 19, 23, 27, 31
    };
    // clang-format on

    constexpr auto desc = make_blocked_tensor_descriptor<MPerBlock, NPerBlock, Gm>();

    // The int buffer is viewed through an int4 pointer so that elements can be fetched
    // four at a time.
    const auto view = make_tensor_view(reinterpret_cast<int4*>(matrix.data()), desc);

    // Coordinate of the first element of sub-block (1,1).
    const auto block_origin =
        make_multi_index(number<1>{}, number<1>{}, number<0>{}, number<0>{});
    const auto block_coord = make_tensor_coordinate(desc, block_origin);

    // First row of sub-block (1,1).
    // NOTE(review): the second argument is presumably a linear offset added to the
    // coordinate - confirm against get_vectorized_elements.
    const auto first_row = view.get_vectorized_elements<int4>(block_coord, 0);
    EXPECT_EQ(first_row.x, 20);
    EXPECT_EQ(first_row.y, 21);
    EXPECT_EQ(first_row.z, 22);
    EXPECT_EQ(first_row.w, 23);

    // Second row of sub-block (1,1).
    const auto second_row = view.get_vectorized_elements<int4>(block_coord, 1);
    EXPECT_EQ(second_row.x, 28);
    EXPECT_EQ(second_row.y, 29);
    EXPECT_EQ(second_row.z, 30);
    EXPECT_EQ(second_row.w, 31);
}
|
||||
|
||||
@@ -712,44 +712,12 @@ __global__ void test_4x4_matrix_get_2x2_blocks_with_sfc_and_lds_kernel(int* inpu
|
||||
store_tile(in_lds_window, lds_tile);
|
||||
block_sync_lds();
|
||||
|
||||
// Print the contents of LDS
|
||||
if(threadIdx.x == 0 && blockIdx.x == 0)
|
||||
{
|
||||
printf("LDS contents:\n");
|
||||
int* lds_data = reinterpret_cast<int*>(p_smem);
|
||||
for(index_t i = 0; i < 4; i++)
|
||||
{
|
||||
for(index_t j = 0; j < 4; j++)
|
||||
{
|
||||
printf("%3d ", lds_data[i * 4 + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
// For the output tensor, we need to copy only the diagonal 2x2 blocks to global memory.
|
||||
static_for<0, NumGroupsToMerge, 1>{}([&](auto group) {
|
||||
auto out_tensor = load_tile(make_tile_window(out_lds_window, output_tile_distribution));
|
||||
|
||||
store_tile(output_window, out_tensor);
|
||||
|
||||
// Print the output tensor contents.
|
||||
__syncthreads();
|
||||
if(threadIdx.x == 0 && blockIdx.x == 0)
|
||||
{
|
||||
|
||||
printf("Output tensor contents after loading group %d:\n", group.value);
|
||||
for(index_t i = 0; i < 4; i++)
|
||||
{
|
||||
for(index_t j = 0; j < 2; j++)
|
||||
{
|
||||
printf("%3d", output[i * 2 + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// Moving output window works correctly.
|
||||
if constexpr(group != NumGroupsToMerge - 1)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user