Merge remote-tracking branch 'origin/develop' into tianyuwu/ck_tile/WMMA_GEMM_F16

This commit is contained in:
TianyuanWu
2025-08-08 17:37:36 +08:00
183 changed files with 8839 additions and 1839 deletions

View File

@@ -21,3 +21,5 @@ add_subdirectory(add_rmsnorm2d_rdquant)
# add_subdirectory(layernorm2d)
# add_subdirectory(rmsnorm2d)
add_subdirectory(gemm_block_scale)
add_subdirectory(utility)
add_subdirectory(reduce)

View File

@@ -95,10 +95,12 @@ class TestCkTileBatchedTranspose // N C H W layout_in==
ck_tile::HostTensor<DataType> y_ref(Y_dim, Y_stride);
ck_tile::FillUniformDistribution<DataType>{-.5f, .5f}(x_host);
ck_tile::FillConstant<DataType>{-37}(y_host);
ck_tile::DeviceMem x_dev(x_host.get_element_space_size_in_bytes());
ck_tile::DeviceMem y_dev(y_host.get_element_space_size_in_bytes());
x_dev.ToDevice(x_host.data());
y_dev.ToDevice(y_host.data());
using Kernel = typename Config::Kernel;
@@ -131,8 +133,8 @@ class TestCkTileBatchedTranspose // N C H W layout_in==
height,
width,
height * width,
Config::BlockTile::at(1),
Config::BlockTile::at(0)};
Config::BlockTile::at(0),
Config::BlockTile::at(1)};
auto kargs = Kernel::MakeKargs(host_args);
auto sc = ck_tile::stream_config{};
@@ -140,15 +142,24 @@ class TestCkTileBatchedTranspose // N C H W layout_in==
constexpr dim3 block_size = Kernel::BlockSize();
ck_tile::launch_kernel(
sc, ck_tile::make_kernel<block_size.x, 1>(Kernel{}, grid_size, block_size, 0, kargs));
y_dev.FromDevice(y_host.data());
ck_tile::reference_batched_transpose<DataType>(x_host, y_ref, layout_in, layout_out);
std::ostringstream message;
message << "N=" << N << " C=" << C << " H=" << H << " W=" << W << " layout_in=" << layout_in
<< " layout_out=" << layout_out << " device_name=" << device_name;
<< " layout_out=" << layout_out << " grid_size={" << grid_size.x << ", "
<< grid_size.y << ", " << grid_size.z << "} block_size=" << block_size.x
<< " device_name=" << device_name;
// NB: order of output and reference matters
bool pass = ck_tile::check_err(
y_ref, y_host, message.str(), /* rtol */ 0, /* atol */ 0, /* allow inf */ false);
/* out */ y_host,
/* ref */ y_ref,
message.str(),
/* rtol */ 0,
/* atol */ 0,
/* allow inf */ false);
EXPECT_TRUE(pass);
}
@@ -160,14 +171,16 @@ static const auto kTestingValues = ::testing::Values(
// N C H W layout_in==NCHW
std::tuple{1, 32, 1, 32, true},
std::tuple{1, 64, 1, 64, true},
std::tuple{1, 32, 1, 64, true},
std::tuple{1, 64, 1, 32, true},
std::tuple{2, 12, 1, 32, false},
std::tuple{3, 1334, 1, 37, false},
std::tuple{4, 27, 1, 32, true},
std::tuple{5, 1234, 1, 12, true},
std::tuple{1, 1, 1, 1, true},
std::tuple{1, 1, 1, 1, false},
std::tuple{128, 1024, 64, 64, true},
std::tuple{128, 1024, 64, 64, false},
std::tuple{17, 1024, 64, 64, true},
std::tuple{17, 1024, 64, 64, false},
std::tuple{16, 64, 32, 128, true},
std::tuple{16, 64, 128, 32, false},
std::tuple{1, 2048, 1, 1, true},
@@ -239,6 +252,60 @@ class CaseHalfPadMultiWarpLoadTranspose
{
};
// Fixture: batched-transpose tests for half_t on the LDSLoadTranspose pipeline, configured
// with the values 128, 128, 2, 2 and both trailing flags set to false.
// NOTE(review): PipelineConfig is declared outside this excerpt; the integers are presumably
// the block-tile extents followed by the warp counts and the booleans padding switches -- confirm.
class CaseHalfPadMultiWarp128MNLoadTranspose
: public TestCkTileBatchedTranspose<PipelineConfig<ck_tile::half_t,
PipelineTag::LDSLoadTranspose,
128,
128,
2,
2,
false,
false>>
{
};
// Fixture: same 128, 128, 2, 2, false, false configuration as the LoadTranspose variant,
// but on the Universal pipeline.
// NOTE(review): meaning of the PipelineConfig arguments is not visible here -- confirm.
class CaseHalfPadMultiWarp128MN
: public TestCkTileBatchedTranspose<
PipelineConfig<ck_tile::half_t, PipelineTag::Universal, 128, 128, 2, 2, false, false>>
{
};
// Fixture: Universal pipeline for half_t with unequal leading values (32, 64) and 1, 1,
// i.e. a rectangular configuration.
// NOTE(review): presumably 32x64 is the block tile and 1x1 the warp layout -- confirm.
class CaseHalfPadRectTile1
: public TestCkTileBatchedTranspose<
PipelineConfig<ck_tile::half_t, PipelineTag::Universal, 32, 64, 1, 1, false, false>>
{
};
// Fixture: mirror image of CaseHalfPadRectTile1 -- the two leading values are swapped (64, 32).
// NOTE(review): presumably 64x32 is the block tile and 1x1 the warp layout -- confirm.
class CaseHalfPadRectTile2
: public TestCkTileBatchedTranspose<
PipelineConfig<ck_tile::half_t, PipelineTag::Universal, 64, 32, 1, 1, false, false>>
{
};
// Fixture: the 32, 64, 1, 1 rectangular configuration on the LDSLoadTranspose pipeline.
// NOTE(review): meaning of the PipelineConfig arguments is not visible here -- confirm.
class CaseHalfPadRectTile1LoadTranspose
: public TestCkTileBatchedTranspose<PipelineConfig<ck_tile::half_t,
PipelineTag::LDSLoadTranspose,
32,
64,
1,
1,
false,
false>>
{
};
// Fixture: the 64, 32, 1, 1 rectangular configuration on the LDSLoadTranspose pipeline.
// NOTE(review): meaning of the PipelineConfig arguments is not visible here -- confirm.
class CaseHalfPadRectTile2LoadTranspose
: public TestCkTileBatchedTranspose<PipelineConfig<ck_tile::half_t,
PipelineTag::LDSLoadTranspose,
64,
32,
1,
1,
false,
false>>
{
};
TEST_P(CaseHalf, TestCorrectness) { this->Run(GetParam()); }
TEST_P(CaseByte, TestCorrectness) { this->Run(GetParam()); }
TEST_P(CaseWord, TestCorrectness) { this->Run(GetParam()); }
@@ -248,6 +315,12 @@ TEST_P(CaseHalfPad, TestCorrectness) { this->Run(GetParam()); }
TEST_P(CaseHalfPadLoadTranspose, TestCorrectness) { this->Run(GetParam()); }
TEST_P(CaseHalfPadMultiWarp, TestCorrectness) { this->Run(GetParam()); }
TEST_P(CaseHalfPadMultiWarpLoadTranspose, TestCorrectness) { this->Run(GetParam()); }
TEST_P(CaseHalfPadMultiWarp128MN, TestCorrectness) { this->Run(GetParam()); }
TEST_P(CaseHalfPadMultiWarp128MNLoadTranspose, TestCorrectness) { this->Run(GetParam()); }
TEST_P(CaseHalfPadRectTile1, TestCorrectness) { this->Run(GetParam()); }
TEST_P(CaseHalfPadRectTile1LoadTranspose, TestCorrectness) { this->Run(GetParam()); }
TEST_P(CaseHalfPadRectTile2, TestCorrectness) { this->Run(GetParam()); }
TEST_P(CaseHalfPadRectTile2LoadTranspose, TestCorrectness) { this->Run(GetParam()); }
// clang-format off
INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalf, kTestingValues);
@@ -259,4 +332,11 @@ INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPad, kTestingV
INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadLoadTranspose, kTestingValues);
INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarp, kTestingValues);
INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarpLoadTranspose, kTestingValues);
INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarp128MN, kTestingValues);
INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarp128MNLoadTranspose, kTestingValues);
INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile1, kTestingValues);
INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile1LoadTranspose, kTestingValues);
INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile2, kTestingValues);
INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile2LoadTranspose, kTestingValues);
// clang-format on

View File

@@ -3,11 +3,13 @@ if(GPU_TARGETS MATCHES "gfx9")
endif()
if(GPU_TARGETS MATCHES "gfx95")
add_gtest_executable(test_ck_tile_pk_fp4 test_pk_fp4.cpp)
add_gtest_executable(test_ck_tile_mx_scale test_mx_scale.cpp)
endif()
if(CK_USE_OCP_FP8 OR CK_USE_FNUZ_FP8)
add_gtest_executable(test_ck_tile_fp8 test_fp8.cpp)
target_compile_options(test_ck_tile_fp8 PRIVATE -Wno-float-equal)
target_compile_definitions(test_ck_tile_fp8 PUBLIC GTEST_HAS_RTTI=0)
# conditionally specify the use of OCP_FP8
if(CK_USE_OCP_FP8)
target_compile_options(test_ck_tile_fp8 PRIVATE -DCK_TILE_USE_OCP_FP8)

View File

@@ -0,0 +1,162 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include <hip/hip_runtime.h>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
using ck_tile::bf16_t;
using ck_tile::bf16x2_t;
using ck_tile::fp16_t;
using ck_tile::fp16x2_t;
using ck_tile::fp32_t;
using ck_tile::fp32x2_t;
using ck_tile::number;
using ck_tile::pk_fp4_t;
template <typename SRC, typename DST, bool is_device>
CK_TILE_HOST void test_convert();
using ck_tile::e8m0_raw_t;
using ck_tile::e8m0_t;
// Pins the limit values ck_tile::numeric reports for the e8m0 scale type:
// no infinity, zero() comparing equal to signaling_NaN(), and the raw encodings of min()/max().
TEST(OCP_Scale, NumericLimits)
{
    using limits = ck_tile::numeric<e8m0_t>;

    const e8m0_t smallest{e8m0_raw_t{0b00000000}};
    const e8m0_t largest{e8m0_raw_t{0b11111110}};

    EXPECT_EQ(limits::has_inf(), false);
    EXPECT_EQ(limits::zero(), limits::signaling_NaN());
    EXPECT_EQ(limits::min(), smallest);
    EXPECT_EQ(limits::max(), largest);
}
// A scale built from a power-of-two float must equal the scale built from the
// corresponding biased raw exponent.
TEST(OCP_Scale, NumericBasic)
{
    // 1.0f == 2^0
    const auto one_from_float = e8m0_t{1.0f};
    const auto one_from_raw   = e8m0_t{e8m0_raw_t{ck_tile::numeric_traits<e8m0_t>::bias}};
    EXPECT_EQ(one_from_float, one_from_raw);

    // 8.0f == 2^3
    const auto eight_from_float = e8m0_t{8.0f};
    const auto eight_from_raw = e8m0_t{e8m0_raw_t{3 + ck_tile::numeric_traits<e8m0_t>::bias}};
    EXPECT_EQ(eight_from_float, eight_from_raw);
}
// Runs the scaled round-trip check on the device path for every <SRC, DST> pairing below.
TEST(OCP_Scale, ScaledConvertDevice)
{
    constexpr bool on_device = true;

    // Same source and destination type (e.g. fp32 -> fp4 -> fp32).
    test_convert<fp32_t, fp32_t, on_device>();
    test_convert<fp16_t, fp16_t, on_device>();
    test_convert<bf16_t, bf16_t, on_device>();

    // fp32 source, 16-bit destination.
    test_convert<fp32_t, fp16_t, on_device>();
    test_convert<fp32_t, bf16_t, on_device>();

    // 16-bit source, fp32 destination.
    test_convert<fp16_t, fp32_t, on_device>();
    test_convert<bf16_t, fp32_t, on_device>();
}
// Runs the scaled round-trip check on the host path for every <SRC, DST> pairing below.
TEST(OCP_Scale, ScaledConvertHost)
{
    constexpr bool on_device = false;

    // Same source and destination type (e.g. fp32 -> fp4 -> fp32).
    test_convert<fp32_t, fp32_t, on_device>();
    test_convert<fp16_t, fp16_t, on_device>();
    test_convert<bf16_t, bf16_t, on_device>();

    // fp32 source, 16-bit destination.
    test_convert<fp32_t, fp16_t, on_device>();
    test_convert<fp32_t, bf16_t, on_device>();

    // 16-bit source, fp32 destination.
    test_convert<fp16_t, fp32_t, on_device>();
    test_convert<bf16_t, fp32_t, on_device>();
}
// Smoke test: a HostTensor of e8m0 scales can be constructed, filled and zeroed.
// There are no value assertions; the test only has to compile and run.
TEST(OCP_Scale, tensorInit)
{
    ck_tile::HostTensor<e8m0_t> scales({10, 10});

    ck_tile::FillUniformDistribution<e8m0_t> fill_with_ones{1.f, 1.f};
    fill_with_ones(scales);

    scales.SetZero();
}
// Conversion shorthands used by the code that follows.
// Two-argument forms forward to ck_tile::scaled_type_convert (value, scale);
// one-argument forms (including the trailing-underscore variants) forward to ck_tile::type_convert.
// SRC, DST and DSTx2_t are NOT macro parameters: they resolve to whatever those names denote
// at the point of expansion, so these macros are only usable where such names are in scope.
#define toPF4(x, y) ck_tile::scaled_type_convert<pk_fp4_t>(x, y)
#define toDST(x, y) ck_tile::scaled_type_convert<DST>(x, y)
#define toDSTx2(x, y) ck_tile::scaled_type_convert<DSTx2_t>(x, y)
#define toF32(x) ck_tile::type_convert<float>(x)
#define toPF4_(x) ck_tile::type_convert<pk_fp4_t>(x)
#define toSRC(x) ck_tile::type_convert<SRC>(x)
#define toDST_(x) ck_tile::type_convert<DST>(x)
// Generic __global__ entry point: default-constructs a Kernel functor and invokes its
// call operator with the arguments, which are received by value.
template <typename Kernel, typename... Args>
__global__ void MyKernel(Args... args)
{
Kernel{}(args...);
}
// Functor converting N values SRC -> pk_fp4_t -> DST, two elements per step.
// `scale1` is passed to the conversion into pk_fp4_t and `scale2` to the conversion into DST.
// Steps whose index is a multiple of 4 convert the two elements individually (scalar path and
// vector-then-unpack path); all other steps convert the pair as one 2-wide vector. With N == 4
// this means elements 0/1 take the first branch and elements 2/3 the second.
template <typename SRC, typename DST, int N>
struct SrcPkfp4Dst
{
CK_TILE_HOST_DEVICE void
operator()(const SRC* src, DST* dst, e8m0_t scale1, e8m0_t scale2) const
{
using SRCx2_t = ck_tile::ext_vector_t<SRC, 2>;
using DSTx2_t = ck_tile::ext_vector_t<DST, 2>;
// Step of 2: every iteration reads src[i] and src[i + 1].
ck_tile::static_for<0, N, 2>{}([&](auto i) {
const auto input2 = SRCx2_t{src[i], src[i + 1]};
if(i % 4 == 0)
{
// ex: fp32_t -> fp4 -> bf16_t
dst[i] = toDST(toPF4(src[i], scale1), scale2);
// ex: fp32x2_t -> pk_fp4 -> unpack<1> -> bf16_t
// (the element selected is number<1>, i.e. the one matching src[i + 1])
dst[i + 1] = toDST(toPF4_(toPF4(input2, scale1).unpack(number<1>{})), scale2);
}
else
{
// ex: fp32x2_t -> pk_fp4_t -> bf16x2_t
// dst is viewed as an array of 2-wide vectors; i >> 1 is the index of the pair (i, i + 1).
// NOTE(review): this store assumes dst is suitably aligned for DSTx2_t -- confirm for host buffers.
reinterpret_cast<DSTx2_t*>(dst)[i >> 1] = toDSTx2(toPF4(input2, scale1), scale2);
}
});
}
};
// Round-trips a fixed data set SRC -> pk_fp4_t -> DST through SrcPkfp4Dst and compares the
// result element-wise with precomputed references.
//
// Template parameters:
//   SRC       - element type of the input buffer
//   DST       - element type of the output buffer
//   is_device - true: run the functor in a single-thread kernel; false: call it on the host
//
// Failures are reported through gtest (EXPECT_* / ASSERT_*); nothing is returned.
template <typename SRC, typename DST, bool is_device>
CK_TILE_HOST void test_convert()
{
    const auto test_data = std::array{4.f, 6.f, 8.f, 10.f};
    const auto ref_data  = std::array{8.f, 16.f, 16.f, 16.f};

    const auto scale1 = e8m0_t{8.0f};
    const auto scale2 = e8m0_t{16.0f};

    static_assert(test_data.size() == ref_data.size());
    // SrcPkfp4Dst consumes the data two elements at a time.
    static_assert(test_data.size() % 2 == 0);
    constexpr int N = test_data.size();

    std::array<SRC, N> in;
    std::array<DST, N> ref;
    // Value-initialised so the final comparison never reads indeterminate data,
    // even if the conversion below did not write every element.
    std::array<DST, N> out{};

    // prepare input and ground truth in host
    for(int i = 0; i < N; ++i)
    {
        in[i]  = toSRC(test_data[i]);
        ref[i] = toDST_(ref_data[i]);

        // The chosen values must be exactly representable in SRC / DST.
        EXPECT_EQ(test_data[i], toF32(in[i]));
        EXPECT_EQ(ref_data[i], toF32(ref[i]));
    }

    using job = SrcPkfp4Dst<SRC, DST, N>;

    if constexpr(is_device)
    {
        // Device buffers live on the stack; DeviceMem releases them when leaving the scope.
        ck_tile::DeviceMem in_d(in.size() * sizeof(SRC));
        ck_tile::DeviceMem out_d(out.size() * sizeof(DST));

        in_d.ToDevice(in.data());

        MyKernel<job><<<1, 1>>>(reinterpret_cast<const SRC*>(in_d.GetDeviceBuffer()),
                                reinterpret_cast<DST*>(out_d.GetDeviceBuffer()),
                                scale1,
                                scale2);

        // A failed launch would otherwise go unnoticed and the comparison below
        // would run against whatever the output buffer happens to contain.
        const hipError_t launch_status = hipGetLastError();
        ASSERT_EQ(launch_status, hipSuccess) << hipGetErrorString(launch_status);

        out_d.FromDevice(out.data());
    }
    else
    {
        job{}(in.data(), out.data(), scale1, scale2);
    }

    for(int i = 0; i < N; ++i)
        EXPECT_EQ(ref[i], out[i]) << "i:" << i;
}

View File

@@ -0,0 +1,7 @@
# Build the reduce2d test only when GPU_TARGETS matches the pattern "gfx9".
if(GPU_TARGETS MATCHES "gfx9")
add_gtest_executable(test_ck_tile_reduce2d test_reduce2d.cpp)
# NOTE(review): `result` is not set in this file; presumably add_gtest_executable sets it
# to 0 when the target was actually created -- confirm against that helper.
if(result EQUAL 0)
target_link_libraries(test_ck_tile_reduce2d PRIVATE utility)
endif()
endif()

View File

@@ -0,0 +1,359 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <gtest/gtest.h>
#include <vector>
#include <cmath>
#include <tuple>
#include <iostream>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/reduce.hpp"
#include "ck_tile/host/kernel_launch.hpp"
// Typed fixture for ck_tile::Reduce.
// `Tuple` is a std::tuple of eight types, in this order: input element type, compute
// (accumulation) type, output element type, reduce operation, and the four sequences
// forwarded to ck_tile::Reduce2dShape (block warps, block tile, warp tile, thread tile).
template <typename Tuple>
class TestCkTileReduce : public ::testing::Test
{
protected:
using XDataType = std::tuple_element_t<0, Tuple>;
using ComputeDataType = std::tuple_element_t<1, Tuple>;
using YDataType = std::tuple_element_t<2, Tuple>;
using ReduceOpType = std::tuple_element_t<3, Tuple>;
using BlockWarps_ = std::tuple_element_t<4, Tuple>;
using BlockTile_ = std::tuple_element_t<5, Tuple>;
using WarpTile_ = std::tuple_element_t<6, Tuple>;
using ThreadTile_ = std::tuple_element_t<7, Tuple>;
using TestReduce2dShape =
ck_tile::Reduce2dShape<BlockWarps_, BlockTile_, WarpTile_, ThreadTile_>;
// Runs one reduction on the device and checks it against ck_tile::reference_reduce.
//   InputDim               - rank of the input; number of entries taken from
//                            input_shape / input_strides when building the kernel tuples
//   input_shape/strides    - extents and strides of the input tensor
//   output_shape/strides   - extents and strides of the output tensor
//   kept_dim_len_prod      - product of the kept extents; used to size the grid
//   total_reduce_elements  - product of the reduced extents; used for the error thresholds
//   kept_dims, reduce_dims - compile-time sequences of kept / reduced dimension indices
// Throws std::runtime_error when Kernel::IsSupportedArgument rejects the problem.
template <std::size_t InputDim, typename KeptDimSeq, typename ReduceDimSeq>
void RunGenericTest(const std::vector<ck_tile::index_t>& input_shape,
const std::vector<ck_tile::index_t>& input_strides,
const std::vector<ck_tile::index_t>& output_shape,
const std::vector<ck_tile::index_t>& output_strides,
ck_tile::index_t kept_dim_len_prod,
ck_tile::index_t total_reduce_elements,
KeptDimSeq kept_dims,
ReduceDimSeq reduce_dims)
{
ck_tile::HostTensor<XDataType> h_x(input_shape, input_strides);
ck_tile::HostTensor<YDataType> h_y(output_shape, output_strides);
ck_tile::HostTensor<YDataType> h_y_ref(output_shape, output_strides);
ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(h_x);
h_y.SetZero();
h_y_ref.SetZero();
ck_tile::DeviceMem d_x_mem(h_x.get_element_space_size_in_bytes());
ck_tile::DeviceMem d_y_mem(h_y.get_element_space_size_in_bytes());
d_x_mem.ToDevice(h_x.data());
d_y_mem.ToDevice(h_y.data()); // Initialize device output buffer
// Problem and kernel setup
using Problem = ck_tile::
Reduce2dProblem<XDataType, ComputeDataType, YDataType, TestReduce2dShape, ReduceOpType>;
using Kernel = ck_tile::Reduce<Problem>;
// Launch configuration
// NOTE(review): the block size is a fixed 256 rather than derived from the shape -- confirm
// it matches every BlockWarps_ configuration used with this fixture.
constexpr ck_tile::index_t kBlockSize = 256;
constexpr ck_tile::index_t kBlockPerCu = 1;
// Ceiling division: one workgroup per Block_M kept elements.
ck_tile::index_t kGridSize =
(kept_dim_len_prod + TestReduce2dShape::Block_M - 1) / TestReduce2dShape::Block_M;
// Generic helper to create tuple from vector based on compile-time size
// (explicit-template-parameter lambdas; the first N vector entries become tuple elements)
auto make_shape_tuple = []<std::size_t N>(const std::vector<ck_tile::index_t>& vec) {
return [&vec]<std::size_t... I>(std::index_sequence<I...>) {
return ck_tile::make_tuple(vec[I]...);
}(std::make_index_sequence<N>{});
};
auto input_shape_tuple = make_shape_tuple.template operator()<InputDim>(input_shape);
auto input_strides_tuple = make_shape_tuple.template operator()<InputDim>(input_strides);
if(!Kernel::IsSupportedArgument(
output_shape[output_shape.size() - 1],
input_strides_tuple)) // output tensor's continuous dimension
{
throw std::runtime_error("Wrong! Arguments not supported!\n");
}
ck_tile::launch_kernel(ck_tile::stream_config{nullptr, false, 0},
ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
Kernel{},
kGridSize,
kBlockSize,
0,
static_cast<XDataType*>(d_x_mem.GetDeviceBuffer()),
static_cast<YDataType*>(d_y_mem.GetDeviceBuffer()),
input_shape_tuple,
input_strides_tuple,
kept_dims,
reduce_dims));
// Get results back
d_y_mem.FromDevice(h_y.data());
// Reference computation
ck_tile::reference_reduce<XDataType, ComputeDataType, YDataType>(
h_x, h_y_ref, ReduceOpType{}, kept_dims, reduce_dims);
// Calculate proper error thresholds based on data types and number of accumulations
const auto rtol = ck_tile::get_relative_threshold<XDataType, YDataType, ComputeDataType>(
total_reduce_elements);
// 5.0f equals the upper bound passed to FillUniformDistribution for the input above.
const auto atol = ck_tile::get_absolute_threshold<XDataType, YDataType, ComputeDataType>(
5.0f, total_reduce_elements);
bool result =
ck_tile::check_err(h_y, h_y_ref, "Error: Incorrect reduce results!", rtol, atol);
EXPECT_TRUE(result);
}
// Convenience functions for specific dimensional patterns
// 2D input {dim0, dim1}, packed row-major strides; reduces dim1, output shape {dim0}.
void RunTest2D_KeepDim0_ReduceDim1(ck_tile::index_t dim0, ck_tile::index_t dim1)
{
constexpr auto kept_dims = ck_tile::sequence<0>{};
constexpr auto reduce_dims = ck_tile::sequence<1>{};
// Input shape and strides
std::vector<ck_tile::index_t> input_shape = {dim0, dim1};
std::vector<ck_tile::index_t> input_strides = {dim1, 1};
// Output shape and strides (keep dim0)
std::vector<ck_tile::index_t> output_shape = {dim0};
std::vector<ck_tile::index_t> output_strides = {1};
// Calculate products
ck_tile::index_t kept_dim_len_prod = dim0;
ck_tile::index_t total_reduce_elements = dim1;
RunGenericTest<2>(input_shape,
input_strides,
output_shape,
output_strides,
kept_dim_len_prod,
total_reduce_elements,
kept_dims,
reduce_dims);
}
// 3D input {dim0, dim1, dim2}, packed row-major strides; reduces dim1 and dim2, output {dim0}.
void RunTest3D_KeepDim0_ReduceDim12(ck_tile::index_t dim0,
ck_tile::index_t dim1,
ck_tile::index_t dim2)
{
constexpr auto kept_dims = ck_tile::sequence<0>{};
constexpr auto reduce_dims = ck_tile::sequence<1, 2>{};
// Input shape and strides
std::vector<ck_tile::index_t> input_shape = {dim0, dim1, dim2};
std::vector<ck_tile::index_t> input_strides = {dim1 * dim2, dim2, 1};
// Output shape and strides (keep dim0)
std::vector<ck_tile::index_t> output_shape = {dim0};
std::vector<ck_tile::index_t> output_strides = {1};
// Calculate products
ck_tile::index_t kept_dim_len_prod = dim0; // product of kept dimensions
ck_tile::index_t total_reduce_elements = dim1 * dim2; // product of reduced dimensions
RunGenericTest<3>(input_shape,
input_strides,
output_shape,
output_strides,
kept_dim_len_prod,
total_reduce_elements,
kept_dims,
reduce_dims);
}
// 3D input {dim0, dim1, dim2}, packed row-major strides; reduces dim2, output {dim0, dim1}.
void RunTest3D_KeepDim01_ReduceDim2(ck_tile::index_t dim0,
ck_tile::index_t dim1,
ck_tile::index_t dim2)
{
constexpr auto kept_dims = ck_tile::sequence<0, 1>{};
constexpr auto reduce_dims = ck_tile::sequence<2>{};
// Input shape and strides
std::vector<ck_tile::index_t> input_shape = {dim0, dim1, dim2};
std::vector<ck_tile::index_t> input_strides = {dim1 * dim2, dim2, 1};
// Output shape and strides (keep dim0, dim1)
std::vector<ck_tile::index_t> output_shape = {dim0, dim1};
std::vector<ck_tile::index_t> output_strides = {dim1, 1};
// Calculate products
ck_tile::index_t kept_dim_len_prod = dim0 * dim1; // product of kept dimensions
ck_tile::index_t total_reduce_elements = dim2; // product of reduced dimensions
RunGenericTest<3>(input_shape,
input_strides,
output_shape,
output_strides,
kept_dim_len_prod,
total_reduce_elements,
kept_dims,
reduce_dims);
}
// 4D input {N, C, H, W}, packed row-major strides; reduces H and W, output {N, C}.
void RunTest4D_KeepDim01_ReduceDim23(ck_tile::index_t N,
ck_tile::index_t C,
ck_tile::index_t H,
ck_tile::index_t W)
{
constexpr auto kept_dims = ck_tile::sequence<0, 1>{};
constexpr auto reduce_dims = ck_tile::sequence<2, 3>{};
// Input shape and strides
std::vector<ck_tile::index_t> input_shape = {N, C, H, W};
std::vector<ck_tile::index_t> input_strides = {C * H * W, H * W, W, 1};
// Output shape and strides (keep dim0, dim1)
std::vector<ck_tile::index_t> output_shape = {N, C};
std::vector<ck_tile::index_t> output_strides = {C, 1};
// Calculate products
ck_tile::index_t kept_dim_len_prod = N * C; // product of kept dimensions
ck_tile::index_t total_reduce_elements = H * W; // product of reduced dimensions
RunGenericTest<4>(input_shape,
input_strides,
output_shape,
output_strides,
kept_dim_len_prod,
total_reduce_elements,
kept_dims,
reduce_dims);
}
// 4D input {N, H, W, C}, packed row-major strides; reduces H and W (dims 1 and 2),
// keeping dims 0 and 3, output {N, C}.
void RunTest4D_KeepDim03_ReduceDim12(ck_tile::index_t N,
ck_tile::index_t H,
ck_tile::index_t W,
ck_tile::index_t C)
{
constexpr auto kept_dims = ck_tile::sequence<0, 3>{};
constexpr auto reduce_dims = ck_tile::sequence<1, 2>{};
// Input shape and strides
std::vector<ck_tile::index_t> input_shape = {N, H, W, C};
std::vector<ck_tile::index_t> input_strides = {H * W * C, W * C, C, 1};
// Output shape and strides (keep dim0, dim3)
std::vector<ck_tile::index_t> output_shape = {N, C};
std::vector<ck_tile::index_t> output_strides = {C, 1};
// Calculate products
ck_tile::index_t kept_dim_len_prod = N * C; // product of kept dimensions
ck_tile::index_t total_reduce_elements = H * W; // product of reduced dimensions
RunGenericTest<4>(input_shape,
input_strides,
output_shape,
output_strides,
kept_dim_len_prod,
total_reduce_elements,
kept_dims,
reduce_dims);
}
};
// Shape parameters for different test configurations
// Each ShapeN_* group supplies the four sequences the fixture forwards to
// ck_tile::Reduce2dShape, in the order BlockWarps, BlockTile, WarpTile, ThreadTile.
using Shape1_BlockWarps = ck_tile::sequence<4, 1>;
using Shape1_BlockTile = ck_tile::sequence<128, 128>;
using Shape1_WarpTile = ck_tile::sequence<32, 128>;
using Shape1_ThreadTile = ck_tile::sequence<8, 8>;
using Shape2_BlockWarps = ck_tile::sequence<2, 2>; // Cross-warp reduction test
using Shape2_BlockTile = ck_tile::sequence<2, 1024>;
using Shape2_WarpTile = ck_tile::sequence<1, 512>;
using Shape2_ThreadTile = ck_tile::sequence<1, 8>;
// Test configurations for different data types and operations
// Tuple layout expected by the fixture:
// <XDataType, ComputeDataType, YDataType, ReduceOp, BlockWarps, BlockTile, WarpTile, ThreadTile>
// float in / float compute / float out, Add, Shape1.
using TestConfig_F32_Add = std::tuple<float,
float,
float,
ck_tile::ReduceOp::Add,
Shape1_BlockWarps,
Shape1_BlockTile,
Shape1_WarpTile,
Shape1_ThreadTile>;
// half in / float compute / half out, Add, Shape1.
using TestConfig_F16_Add = std::tuple<ck_tile::half_t,
float,
ck_tile::half_t,
ck_tile::ReduceOp::Add,
Shape1_BlockWarps,
Shape1_BlockTile,
Shape1_WarpTile,
Shape1_ThreadTile>;
// float Add on Shape2 (the 2x2 block-warps shape).
using TestConfig_F32_CrossWarp = std::tuple<float,
float,
float,
ck_tile::ReduceOp::Add,
Shape2_BlockWarps,
Shape2_BlockTile,
Shape2_WarpTile,
Shape2_ThreadTile>;
// float Max, Shape1.
using TestConfig_F32_Max = std::tuple<float,
float,
float,
ck_tile::ReduceOp::Max,
Shape1_BlockWarps,
Shape1_BlockTile,
Shape1_WarpTile,
Shape1_ThreadTile>;
// float SquareAdd, Shape1.
using TestConfig_F32_SquareAdd = std::tuple<float,
float,
float,
ck_tile::ReduceOp::SquareAdd,
Shape1_BlockWarps,
Shape1_BlockTile,
Shape1_WarpTile,
Shape1_ThreadTile>;
// Every TYPED_TEST of TestCkTileReduce is instantiated once per entry of this list.
using TestTypes = ::testing::Types<TestConfig_F32_Add,
TestConfig_F16_Add,
TestConfig_F32_CrossWarp,
TestConfig_F32_Max,
TestConfig_F32_SquareAdd>;
TYPED_TEST_SUITE(TestCkTileReduce, TestTypes);
// 2D Tests - Keep dim0, reduce dim1
TYPED_TEST(TestCkTileReduce, Test2D_KeepDim0_ReduceDim1_64x32)
{
this->RunTest2D_KeepDim0_ReduceDim1(64, 32);
}
TYPED_TEST(TestCkTileReduce, Test2D_KeepDim0_ReduceDim1_1024x512)
{
this->RunTest2D_KeepDim0_ReduceDim1(1024, 512);
}
// 3D Tests - Keep dim0, reduce dim1,2
// NOTE(review): the test name ends in 128x128x1 but the call below uses 128 x 128 x 8 --
// decide which one is intended and align the other.
TYPED_TEST(TestCkTileReduce, Test3D_KeepDim0_ReduceDim12_128x128x1)
{
this->RunTest3D_KeepDim0_ReduceDim12(128, 128, 8);
}
// 3D Tests - Keep dim0,1, reduce dim2
TYPED_TEST(TestCkTileReduce, Test3D_KeepDim01_ReduceDim2_512x1024x16)
{
this->RunTest3D_KeepDim01_ReduceDim2(512, 1024, 16);
}
// 4D Tests - Keep dim0,1, reduce dim2,3 (NCHW -> NC)
TYPED_TEST(TestCkTileReduce, Test4D_KeepDim01_ReduceDim23_32x256x16x16)
{
this->RunTest4D_KeepDim01_ReduceDim23(32, 256, 16, 16);
}
// 4D Tests - Keep dim0,3, reduce dim1,2 (NHWC -> NC)
TYPED_TEST(TestCkTileReduce, Test4D_KeepDim03_ReduceDim12_16x32x32x128)
{
this->RunTest4D_KeepDim03_ReduceDim12(16, 32, 32, 128);
}

View File

@@ -0,0 +1,4 @@
message("-- Adding: test/ck_tile/utility/")
# Add print tests
add_subdirectory(print)

View File

@@ -0,0 +1,8 @@
# Print utility tests
add_gtest_executable(test_print_sequence test_print_sequence.cpp)
add_gtest_executable(test_print_array test_print_array.cpp)
add_gtest_executable(test_print_tuple test_print_tuple.cpp)
add_gtest_executable(test_print_coordinate_transform test_print_coordinate_transform.cpp)
add_gtest_executable(test_print_static_encoding_pattern test_print_static_encoding_pattern.cpp)
add_gtest_executable(test_print_buffer_view test_print_buffer_view.cpp)
add_gtest_executable(test_print_basic_types test_print_basic_types.cpp)

View File

@@ -0,0 +1,70 @@
# Print Function Tests
This directory contains unit tests for testing the print functionality of various data structures and coordinate transformations in the composable_kernel library.
## Tests Included
### test_print_sequence.cpp
Tests the print functionality for `sequence<...>` containers:
- Simple sequences with multiple elements
- Single element sequences
- Empty sequences
- Longer sequences
### test_print_array.cpp
Tests the print functionality for `array<T, N>` containers:
- Arrays with integer values
- Single element arrays
- Empty arrays (size 0)
- Arrays with floating point values
### test_print_tuple.cpp
Tests the print functionality for `tuple<...>` containers:
- Simple tuples with numbers
- Single element tuples
- Empty tuples
- Mixed type tuples
### test_print_coordinate_transform.cpp
Tests the print functionality for coordinate transformation structures:
- `pass_through` transform
- `embed` transform
- `merge` transform
- `unmerge` transform
- `freeze` transform
## Testing Approach
All tests capture what the print functions write to stdout with Google Test's `testing::internal::CaptureStdout()` / `testing::internal::GetCapturedStdout()` pair and then verify the formatting:
```cpp
testing::internal::CaptureStdout();
print(object);
std::string output = testing::internal::GetCapturedStdout();
EXPECT_EQ(output, "expected_format");
```
This approach enables testing of print function output without affecting the console during test execution.
## Building and Running
The tests are integrated into the CMake build system. To build and run the print tests:
```bash
# Build the specific test
make test_print_sequence
# Run the test
./test_print_sequence
# Or run all print tests using CTest
ctest -R "test_print"
```
## Adding New Tests
To add tests for new data structures:
1. Create a new test file: `test_print_<structure_name>.cpp`
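2. Follow the existing pattern: derive the fixture from `PrintTest` (declared in `test_print_common.hpp`) and capture the output with its `CapturePrintOutput()` helper, which wraps `CaptureStdout()`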
3. Add the test executable to `CMakeLists.txt`

View File

@@ -0,0 +1,59 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "test_print_common.hpp"
#include "ck_tile/core/container/array.hpp"
#include "ck_tile/core/utility/print.hpp"
namespace ck_tile {

// Fixture for the ck_tile::array print tests; CapturePrintOutput() comes from PrintTest.
class PrintArrayTest : public PrintTest
{
};

// array<int, 3>: the complete output string is pinned.
TEST_F(PrintArrayTest, PrintIntArray)
{
    array<int, 3> arr{10, 20, 30};

    std::string output = CapturePrintOutput(arr);

    // The expected format should match the array print function implementation
    EXPECT_EQ(output, "array{size: 3, data: [10, 20, 30]}");
}

// array<int, 1>: no separator is expected for a single element.
TEST_F(PrintArrayTest, PrintSingleElementArray)
{
    array<int, 1> arr{42};

    std::string output = CapturePrintOutput(arr);

    EXPECT_EQ(output, "array{size: 1, data: [42]}");
}

// array<int, 0>: an empty data list is expected.
TEST_F(PrintArrayTest, PrintEmptyArray)
{
    array<int, 0> arr{};

    std::string output = CapturePrintOutput(arr);

    EXPECT_EQ(output, "array{size: 0, data: []}");
}

// array<float, 2>: float formatting may vary, so only the frame and the leading digits
// of each value are checked.
TEST_F(PrintArrayTest, PrintFloatArray)
{
    array<float, 2> arr{3.14f, 2.71f};

    const std::string output = CapturePrintOutput(arr);

    const std::string prefix = "array{size: 2, data: [";
    const std::string suffix = "]}";

    // Starts-with check.
    EXPECT_TRUE(output.compare(0, prefix.size(), prefix) == 0);
    EXPECT_TRUE(output.find("3.14") != std::string::npos);
    EXPECT_TRUE(output.find("2.71") != std::string::npos);
    // Ends-with check. The length guard matters: `find(suffix) == length() - 2` wraps around
    // for outputs shorter than the suffix and can then compare equal to npos.
    EXPECT_TRUE(output.size() >= suffix.size() &&
                output.compare(output.size() - suffix.size(), suffix.size(), suffix) == 0);
}

} // namespace ck_tile

View File

@@ -0,0 +1,76 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "test_print_common.hpp"
#include "ck_tile/core/utility/print.hpp"
namespace ck_tile {

// Fixture for print tests on built-in C arrays; CapturePrintOutput() comes from PrintTest.
class PrintBasicTypesTest : public PrintTest
{
};

// int[4]: the complete output string is pinned.
TEST_F(PrintBasicTypesTest, PrintIntArray)
{
    int arr[4] = {1, 2, 3, 4};

    std::string output = CapturePrintOutput(arr);

    EXPECT_EQ(output, "[1, 2, 3, 4]");
}

// float[3]: only the frame, the separator and the leading digits are checked.
TEST_F(PrintBasicTypesTest, PrintFloatArray)
{
    float arr[3] = {1.5f, 2.5f, 3.5f};

    std::string output = CapturePrintOutput(arr);

    // Note: floating point formatting may vary, so we check for key elements
    EXPECT_TRUE(output.find("[") == 0);
    EXPECT_TRUE(output.find("1.5") != std::string::npos);
    EXPECT_TRUE(output.find("2.5") != std::string::npos);
    EXPECT_TRUE(output.find("3.5") != std::string::npos);
    // back() on an empty string is undefined, and the checks above do not stop the test
    // when they fail, so guard the access.
    EXPECT_TRUE(!output.empty() && output.back() == ']');
    EXPECT_TRUE(output.find(", ") != std::string::npos);
}

// double[2]: only the frame and the leading digits are checked.
TEST_F(PrintBasicTypesTest, PrintDoubleArray)
{
    double arr[2] = {10.123, 20.456};

    std::string output = CapturePrintOutput(arr);

    EXPECT_TRUE(output.find("[") == 0);
    EXPECT_TRUE(output.find("10.123") != std::string::npos);
    EXPECT_TRUE(output.find("20.456") != std::string::npos);
    // Guarded for the same reason as in PrintFloatArray.
    EXPECT_TRUE(!output.empty() && output.back() == ']');
}

// unsigned int[3]: the complete output string is pinned.
TEST_F(PrintBasicTypesTest, PrintUnsignedIntArray)
{
    unsigned int arr[3] = {100u, 200u, 300u};

    std::string output = CapturePrintOutput(arr);

    EXPECT_EQ(output, "[100, 200, 300]");
}

// char[5]: elements are expected to be printed as characters, one per list entry.
TEST_F(PrintBasicTypesTest, PrintCharArray)
{
    char arr[5] = {'a', 'b', 'c', 'd', 'e'};

    std::string output = CapturePrintOutput(arr);

    EXPECT_EQ(output, "[a, b, c, d, e]");
}

// int[1]: no separator is expected for a single element.
TEST_F(PrintBasicTypesTest, PrintSingleElementArray)
{
    int arr[1] = {42};

    std::string output = CapturePrintOutput(arr);

    EXPECT_EQ(output, "[42]");
}

} // namespace ck_tile

View File

@@ -0,0 +1,78 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "test_print_common.hpp"
#include "ck_tile/core/tensor/buffer_view.hpp"
#include "ck_tile/core/utility/print.hpp"
namespace ck_tile {

// Fixture for buffer_view print tests; CapturePrintOutput() comes from PrintTest.
// Every test builds a view over a local float[4] and checks that the printed text names
// the address space and contains each field label. Field values (pointer, size, invalid
// element value) are not pinned.
class PrintBufferViewTest : public PrintTest
{
};

TEST_F(PrintBufferViewTest, PrintGenericBufferView)
{
    // Test printing generic address space buffer_view
    float data[4] = {100.f, 200.f, 300.f, 400.f};
    // Pass the array itself (decays to float*), as the lds/vgpr tests do. `&data` would be a
    // pointer to the whole array, making float[4] the element type while the size stays 4.
    auto bv = make_buffer_view<address_space_enum::generic>(data, 4);

    std::string output = CapturePrintOutput(bv);

    // Verify the output contains expected information
    EXPECT_TRUE(output.find("buffer_view{AddressSpace: generic") != std::string::npos);
    EXPECT_TRUE(output.find("p_data_:") != std::string::npos);
    EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos);
    EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos);
    EXPECT_TRUE(output.find("}") != std::string::npos);
}

TEST_F(PrintBufferViewTest, PrintGlobalBufferView)
{
    // Test printing global address space buffer_view
    float data[4] = {100.f, 200.f, 300.f, 400.f};
    // Element pointer rather than pointer-to-array; see PrintGenericBufferView.
    auto bv = make_buffer_view<address_space_enum::global>(data, 4);

    std::string output = CapturePrintOutput(bv);

    // Verify the output contains expected information
    EXPECT_TRUE(output.find("buffer_view{AddressSpace: global") != std::string::npos);
    EXPECT_TRUE(output.find("p_data_:") != std::string::npos);
    EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos);
    EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos);
    EXPECT_TRUE(output.find("}") != std::string::npos);
}

TEST_F(PrintBufferViewTest, PrintLdsBufferView)
{
    // Test printing LDS address space buffer_view
    float data[4] = {100.f, 200.f, 300.f, 400.f};
    auto bv       = make_buffer_view<address_space_enum::lds>(data, 4);

    std::string output = CapturePrintOutput(bv);

    // Verify the output contains expected information
    EXPECT_TRUE(output.find("buffer_view{AddressSpace: lds") != std::string::npos);
    EXPECT_TRUE(output.find("p_data_:") != std::string::npos);
    EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos);
    EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos);
    EXPECT_TRUE(output.find("}") != std::string::npos);
}

TEST_F(PrintBufferViewTest, PrintVgprBufferView)
{
    // Test printing VGPR address space buffer_view
    float data[4] = {1.5f, 2.5f, 3.5f, 4.5f};
    auto bv       = make_buffer_view<address_space_enum::vgpr>(data, 4);

    std::string output = CapturePrintOutput(bv);

    // Verify the output contains expected information
    EXPECT_TRUE(output.find("buffer_view{AddressSpace: vgpr") != std::string::npos);
    EXPECT_TRUE(output.find("p_data_:") != std::string::npos);
    EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos);
    EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos);
    EXPECT_TRUE(output.find("}") != std::string::npos);
}

} // namespace ck_tile

View File

@@ -0,0 +1,25 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <gtest/gtest.h>
#include <gtest/gtest-spi.h>
#include "ck_tile/core/utility/print.hpp"
// Shared base fixture for the print tests: offers a helper that runs print() on a value
// while stdout is being captured and hands back the captured text.
class PrintTest : public ::testing::Test
{
    protected:
    void SetUp() override {}

    void TearDown() override {}

    // Returns everything print(type) wrote to stdout.
    template <typename T>
    std::string CapturePrintOutput(const T& type)
    {
        // Bring the ck_tile print overloads into scope for the unqualified call below.
        using namespace ck_tile;

        testing::internal::CaptureStdout();
        print(type);
        std::string captured = testing::internal::GetCapturedStdout();
        return captured;
    }
};

View File

@@ -0,0 +1,83 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "test_print_common.hpp"
#include "ck_tile/core/algorithm/coordinate_transform.hpp"
#include "ck_tile/core/utility/print.hpp"
namespace ck_tile {
class PrintCoordinateTransformTest : public PrintTest
{
};
// pass_through transform: the output must start with the transform name, mention
// up_lengths_ and end with a closing brace.
TEST_F(PrintCoordinateTransformTest, PrintPassThrough)
{
    // Test printing pass_through transform
    auto pt = make_pass_through_transform(number<32>{});

    std::string output = CapturePrintOutput(pt);

    // Verify it contains the pass_through identifier and some structure
    EXPECT_TRUE(output.find("pass_through{") == 0);
    EXPECT_TRUE(output.find("up_lengths_") != std::string::npos);
    // back() on an empty string is undefined and the checks above are non-fatal: guard it.
    EXPECT_TRUE(!output.empty() && output.back() == '}');
}
// Prints an embed transform and checks the text starts with "embed{", mentions
// up_lengths_ and coefficients_, and ends with '}'.
TEST_F(PrintCoordinateTransformTest, PrintEmbed)
{
    auto embed_transform = make_embed_transform(make_tuple(number<4>{}, number<8>{}),
                                                make_tuple(number<1>{}, number<4>{}));
    std::string output   = CapturePrintOutput(embed_transform);

    // std::string::back() is undefined on an empty string; fail cleanly instead
    // of reaching it when nothing was printed.
    ASSERT_FALSE(output.empty());

    EXPECT_TRUE(output.find("embed{") == 0);
    EXPECT_TRUE(output.find("up_lengths_") != std::string::npos);
    EXPECT_TRUE(output.find("coefficients_") != std::string::npos);
    EXPECT_TRUE(output.back() == '}');
}
// Prints a merge transform and checks the text starts with "merge", mentions
// low_lengths_ or up_lengths_, and ends with '}'.
TEST_F(PrintCoordinateTransformTest, PrintMerge)
{
    auto merge_transform = make_merge_transform(make_tuple(number<4>{}, number<8>{}));
    std::string output   = CapturePrintOutput(merge_transform);

    // std::string::back() is undefined on an empty string; fail cleanly instead
    // of reaching it when nothing was printed.
    ASSERT_FALSE(output.empty());

    // Only the "merge" prefix is checked: the concrete type could be
    // merge_v2_magic_division or merge_v3_division_mod.
    EXPECT_TRUE(output.find("merge") == 0);
    EXPECT_TRUE(output.find("low_lengths_") != std::string::npos ||
                output.find("up_lengths_") != std::string::npos);
    EXPECT_TRUE(output.back() == '}');
}
// Prints an unmerge transform and checks the text starts with "unmerge{",
// mentions up_lengths_ and ends with '}'.
TEST_F(PrintCoordinateTransformTest, PrintUnmerge)
{
    auto unmerge_transform = make_unmerge_transform(make_tuple(number<4>{}, number<8>{}));
    std::string output     = CapturePrintOutput(unmerge_transform);

    // std::string::back() is undefined on an empty string; fail cleanly instead
    // of reaching it when nothing was printed.
    ASSERT_FALSE(output.empty());

    EXPECT_TRUE(output.find("unmerge{") == 0);
    EXPECT_TRUE(output.find("up_lengths_") != std::string::npos);
    EXPECT_TRUE(output.back() == '}');
}
// Prints a freeze transform and checks the text starts with "freeze{",
// mentions low_idx_ and ends with '}'.
TEST_F(PrintCoordinateTransformTest, PrintFreeze)
{
    auto freeze_transform = make_freeze_transform(number<5>{});
    std::string output    = CapturePrintOutput(freeze_transform);

    // std::string::back() is undefined on an empty string; fail cleanly instead
    // of reaching it when nothing was printed.
    ASSERT_FALSE(output.empty());

    EXPECT_TRUE(output.find("freeze{") == 0);
    EXPECT_TRUE(output.find("low_idx_") != std::string::npos);
    EXPECT_TRUE(output.back() == '}');
}
} // namespace ck_tile

View File

@@ -0,0 +1,45 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "test_print_common.hpp"
#include "ck_tile/core/utility/print.hpp"
#include "ck_tile/core/container/sequence.hpp"
namespace ck_tile {
// Fixture for the sequence print tests; adds nothing on top of PrintTest.
class PrintSequenceTest : public PrintTest {};
// A three-element sequence must print exactly as "sequence<1, 5, 8>".
TEST_F(PrintSequenceTest, PrintSimpleSequence)
{
    using Seq = sequence<1, 5, 8>;

    const std::string printed = CapturePrintOutput(Seq{});

    EXPECT_EQ(printed, "sequence<1, 5, 8>");
}
// A one-element sequence must print exactly as "sequence<42>".
TEST_F(PrintSequenceTest, PrintSingleElementSequence)
{
    using Seq = sequence<42>;

    const std::string printed = CapturePrintOutput(Seq{});

    EXPECT_EQ(printed, "sequence<42>");
}
// An empty sequence must print exactly as "sequence<>".
TEST_F(PrintSequenceTest, PrintEmptySequence)
{
    using Seq = sequence<>;

    const std::string printed = CapturePrintOutput(Seq{});

    EXPECT_EQ(printed, "sequence<>");
}
} // namespace ck_tile

View File

@@ -0,0 +1,89 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "test_print_common.hpp"
#include "ck_tile/core/algorithm/static_encoding_pattern.hpp"
#include "ck_tile/core/utility/print.hpp"
#include <sstream>
namespace ck_tile {
// Fixture for the TileDistributionEncodingPattern2D print tests. Adds helpers
// that check the captured text for the "<Y0, Y1, Y2>" and "<X0, X1>" summaries.
class PrintStaticEncodingPatternTest : public PrintTest
{
    protected:
    // Expects `output` to contain "<Y0, Y1, Y2>: <y0, y1, y2>" built from the
    // given values. Written as an explicit template (rather than `auto`
    // parameters) so the helper does not depend on C++20 abbreviated function
    // templates.
    template <typename Y0T, typename Y1T, typename Y2T>
    void TestY0Y1Y2(const std::string& output, Y0T Y0, Y1T Y1, Y2T Y2)
    {
        std::stringstream expected;
        expected << "<Y0, Y1, Y2>: <" << Y0 << ", " << Y1 << ", " << Y2 << ">";
        // Stream both strings so a failure shows what was looked for and where.
        EXPECT_TRUE(output.find(expected.str()) != std::string::npos)
            << "missing \"" << expected.str() << "\" in printed output: " << output;
    }

    // Expects `output` to contain "<X0, X1>: <x0, x1>" built from the given values.
    template <typename X0T, typename X1T>
    void TestX0X1(const std::string& output, X0T X0, X1T X1)
    {
        std::stringstream expected;
        expected << "<X0, X1>: <" << X0 << ", " << X1 << ">";
        EXPECT_TRUE(output.find(expected.str()) != std::string::npos)
            << "missing \"" << expected.str() << "\" in printed output: " << output;
    }
};
// Prints a thread_raked TileDistributionEncodingPattern2D<64, 8, 16, 4, ...> and
// checks each labelled field plus the Y0/Y1/Y2 and X0/X1 summaries.
TEST_F(PrintStaticEncodingPatternTest, PrintThreadRakedPattern)
{
    using Pattern =
        TileDistributionEncodingPattern2D<64, 8, 16, 4, tile_distribution_pattern::thread_raked>;
    Pattern instance;

    const std::string printed = CapturePrintOutput(instance);

    // True when `needle` occurs anywhere in the captured text.
    const auto has = [&printed](const char* needle) {
        return printed.find(needle) != std::string::npos;
    };

    EXPECT_TRUE(has("TileDistributionEncodingPattern2D"));
    EXPECT_TRUE(has("BlockSize:64"));
    EXPECT_TRUE(has("YPerTile:8"));
    EXPECT_TRUE(has("XPerTile:16"));
    EXPECT_TRUE(has("VecSize:4"));
    EXPECT_TRUE(has("thread_raked"));

    TestY0Y1Y2(printed, Pattern::Y0, Pattern::Y1, Pattern::Y2);
    TestX0X1(printed, Pattern::X0, Pattern::X1);
}
// Prints a warp_raked TileDistributionEncodingPattern2D<128, 16, 32, 8, ...> and
// checks each labelled field plus the Y0/Y1/Y2 and X0/X1 summaries.
TEST_F(PrintStaticEncodingPatternTest, PrintWarpRakedPattern)
{
    using Pattern =
        TileDistributionEncodingPattern2D<128, 16, 32, 8, tile_distribution_pattern::warp_raked>;
    Pattern instance;

    const std::string printed = CapturePrintOutput(instance);

    // True when `needle` occurs anywhere in the captured text.
    const auto has = [&printed](const char* needle) {
        return printed.find(needle) != std::string::npos;
    };

    EXPECT_TRUE(has("TileDistributionEncodingPattern2D"));
    EXPECT_TRUE(has("BlockSize:128"));
    EXPECT_TRUE(has("YPerTile:16"));
    EXPECT_TRUE(has("XPerTile:32"));
    EXPECT_TRUE(has("VecSize:8"));
    EXPECT_TRUE(has("warp_raked"));

    TestY0Y1Y2(printed, Pattern::Y0, Pattern::Y1, Pattern::Y2);
    TestX0X1(printed, Pattern::X0, Pattern::X1);
}
// Prints a block_raked TileDistributionEncodingPattern2D<256, 32, 64, 16, ...> and
// checks each labelled field plus the Y0/Y1/Y2 and X0/X1 summaries.
TEST_F(PrintStaticEncodingPatternTest, PrintBlockRakedPattern)
{
    using Pattern =
        TileDistributionEncodingPattern2D<256, 32, 64, 16, tile_distribution_pattern::block_raked>;
    Pattern instance;

    const std::string printed = CapturePrintOutput(instance);

    // True when `needle` occurs anywhere in the captured text.
    const auto has = [&printed](const char* needle) {
        return printed.find(needle) != std::string::npos;
    };

    EXPECT_TRUE(has("TileDistributionEncodingPattern2D"));
    EXPECT_TRUE(has("BlockSize:256"));
    EXPECT_TRUE(has("YPerTile:32"));
    EXPECT_TRUE(has("XPerTile:64"));
    EXPECT_TRUE(has("VecSize:16"));
    EXPECT_TRUE(has("block_raked"));

    TestY0Y1Y2(printed, Pattern::Y0, Pattern::Y1, Pattern::Y2);
    TestX0X1(printed, Pattern::X0, Pattern::X1);
}
} // namespace ck_tile

View File

@@ -0,0 +1,66 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "test_print_common.hpp"
#include "ck_tile/core/container/tuple.hpp"
#include "ck_tile/core/numeric/integral_constant.hpp"
#include "ck_tile/core/utility/print.hpp"
namespace ck_tile {
// Fixture for the tuple print tests; adds nothing on top of PrintTest.
class PrintTupleTest : public PrintTest {};
// Prints a tuple of number<1>, number<5>, number<8> and checks the text starts
// with "tuple<", contains each value and ends with '>'.
TEST_F(PrintTupleTest, PrintSimpleTuple)
{
    auto tup           = make_tuple(number<1>{}, number<5>{}, number<8>{});
    std::string output = CapturePrintOutput(tup);

    // std::string::back() is undefined on an empty string; fail cleanly instead
    // of reaching it when nothing was printed.
    ASSERT_FALSE(output.empty());

    EXPECT_TRUE(output.find("tuple<") == 0);
    EXPECT_TRUE(output.find("1") != std::string::npos);
    EXPECT_TRUE(output.find("5") != std::string::npos);
    EXPECT_TRUE(output.find("8") != std::string::npos);
    EXPECT_TRUE(output.back() == '>');
}
// Prints a one-element tuple of number<42> and checks the text starts with
// "tuple<", contains "42" and ends with '>'.
TEST_F(PrintTupleTest, PrintSingleElementTuple)
{
    auto tup           = make_tuple(number<42>{});
    std::string output = CapturePrintOutput(tup);

    // std::string::back() is undefined on an empty string; fail cleanly instead
    // of reaching it when nothing was printed.
    ASSERT_FALSE(output.empty());

    EXPECT_TRUE(output.find("tuple<") == 0);
    EXPECT_TRUE(output.find("42") != std::string::npos);
    EXPECT_TRUE(output.back() == '>');
}
// An empty tuple must print exactly as "tuple<>".
TEST_F(PrintTupleTest, PrintEmptyTuple)
{
    auto empty_tuple = make_tuple();

    const std::string printed = CapturePrintOutput(empty_tuple);

    EXPECT_EQ(printed, "tuple<>");
}
// Prints a tuple mixing number<> and constant<> elements and checks the text
// starts with "tuple<", contains each value and ends with '>'.
TEST_F(PrintTupleTest, PrintMixedTypeTuple)
{
    auto tup           = make_tuple(number<10>{}, constant<20>{}, number<30>{});
    std::string output = CapturePrintOutput(tup);

    // std::string::back() is undefined on an empty string; fail cleanly instead
    // of reaching it when nothing was printed.
    ASSERT_FALSE(output.empty());

    EXPECT_TRUE(output.find("tuple<") == 0);
    EXPECT_TRUE(output.find("10") != std::string::npos);
    EXPECT_TRUE(output.find("20") != std::string::npos);
    EXPECT_TRUE(output.find("30") != std::string::npos);
    EXPECT_TRUE(output.back() == '>');
}
} // namespace ck_tile

View File

@@ -1,4 +1,10 @@
if(GPU_TARGETS MATCHES "gfx9")
add_gtest_executable(test_grouped_convnd_fwd_bias_bnorm_clamp test_grouped_convnd_fwd_bias_bnorm_clamp.cpp)
target_link_libraries(test_grouped_convnd_fwd_bias_bnorm_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_bnorm_clamp_instance device_grouped_conv3d_fwd_bias_bnorm_clamp_instance)
add_gtest_executable(test_grouped_convnd_fwd_gk_bias_bnorm_clamp test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp)
target_link_libraries(test_grouped_convnd_fwd_gk_bias_bnorm_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_bnorm_clamp_instance device_grouped_conv3d_fwd_bias_bnorm_clamp_instance)
add_gtest_executable(test_grouped_convnd_fwd_bias_clamp test_grouped_convnd_fwd_bias_clamp.cpp)
target_link_libraries(test_grouped_convnd_fwd_bias_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_clamp_instance device_grouped_conv3d_fwd_bias_clamp_instance)

View File

@@ -0,0 +1,97 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
// Typed fixture: `Tuple` supplies (DataType, InLayout, WeiLayout, OutLayout).
// Run<NDimSpatial>() pushes every queued ConvParam through the profiler entry
// point with verification enabled and expects all of them to pass.
// NOTE(review): the file includes profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp
// but calls profile_grouped_conv_fwd_bias_clamp_impl - confirm this is the intended
// entry point.
template <typename Tuple>
class TestGroupedConvndFwd : public ::testing::Test
{
    protected:
    using DataType  = std::tuple_element_t<0, Tuple>;
    using InLayout  = std::tuple_element_t<1, Tuple>;
    using WeiLayout = std::tuple_element_t<2, Tuple>;
    using OutLayout = std::tuple_element_t<3, Tuple>;
    using IndexType = ck::index_t;

    // Problem descriptions filled in by each test before calling Run().
    std::vector<ck::utils::conv::ConvParam> conv_params;

    template <ck::index_t NDimSpatial>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());

        bool pass = true;
        for(auto& param : conv_params)
        {
            const bool ok =
                ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl<NDimSpatial,
                                                                       InLayout,
                                                                       WeiLayout,
                                                                       OutLayout,
                                                                       DataType,
                                                                       DataType,
                                                                       DataType,
                                                                       DataType,
                                                                       DataType,
                                                                       IndexType,
                                                                       false /*BiasGK*/>(
                    true,  // do_verification
                    1,     // init_method: integer value
                    false, // do_log
                    false, // time_kernel
                    param);
            if(!ok)
            {
                // Remaining configurations are not profiled once one has failed.
                pass = false;
                break;
            }
        }

        EXPECT_TRUE(pass);
    }
};
// Bring the convolution layout tags used in the type lists below into scope.
using namespace ck::tensor_layout::convolution;
// Typed-test parameter lists. Each tuple is (DataType, InLayout, WeiLayout, OutLayout),
// matching the tuple_element indices read by TestGroupedConvndFwd.
// 2D cases: bhalf, float and half data types with the same layout triple.
using KernelTypes2d = ::testing::Types<std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>,
std::tuple<float, NHWGC, GKYXC, NHWGK>,
std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>>;
// 3D cases: same data types with the 3D layout triple.
using KernelTypes3d = ::testing::Types<std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<float, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK>>;
// Empty derived fixtures so the 2D and 3D type lists can be registered as
// separate typed test suites.
template <typename Tuple>
class TestGroupedConvndFwd2d : public TestGroupedConvndFwd<Tuple>
{
};
template <typename Tuple>
class TestGroupedConvndFwd3d : public TestGroupedConvndFwd<Tuple>
{
};
// Bind each fixture to its type list.
TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d);
TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d);
// Queues two 2D problem descriptions and runs them through Run<2>().
// Field order presumably follows the ConvParam constructor
// (ndim, groups, batch, out-channels, in-channels, then per-dimension vectors)
// - confirm against ConvParam.
TYPED_TEST(TestGroupedConvndFwd2d, Test2D)
{
    auto& cases = this->conv_params;
    cases.clear();

    cases.push_back({2, 2, 32, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
    cases.push_back({2, 2, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});

    this->template Run<2>();
}
// Queues two 3D problem descriptions and runs them through Run<3>().
// Field order presumably follows the ConvParam constructor
// (ndim, groups, batch, out-channels, in-channels, then per-dimension vectors)
// - confirm against ConvParam.
TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
{
    auto& cases = this->conv_params;
    cases.clear();

    cases.push_back(
        {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
    cases.push_back(
        {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});

    this->template Run<3>();
}

View File

@@ -0,0 +1,98 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
// Typed fixture: `Tuple` supplies (DataType, InLayout, WeiLayout, OutLayout).
// Run<NDimSpatial>() pushes every queued ConvParam through the profiler entry
// point with verification enabled and the last template flag set to true, and
// expects all of them to pass.
// NOTE(review): the file includes profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp
// but calls profile_grouped_conv_fwd_bias_clamp_impl - confirm this is the intended
// entry point.
template <typename Tuple>
class TestGroupedConvndFwd : public ::testing::Test
{
    protected:
    using DataType  = std::tuple_element_t<0, Tuple>;
    using InLayout  = std::tuple_element_t<1, Tuple>;
    using WeiLayout = std::tuple_element_t<2, Tuple>;
    using OutLayout = std::tuple_element_t<3, Tuple>;
    using IndexType = ck::index_t;

    // Problem descriptions filled in by each test before calling Run().
    std::vector<ck::utils::conv::ConvParam> conv_params;

    template <ck::index_t NDimSpatial>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());

        bool pass = true;
        for(auto& param : conv_params)
        {
            const bool ok =
                ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl<NDimSpatial,
                                                                       InLayout,
                                                                       WeiLayout,
                                                                       OutLayout,
                                                                       DataType,
                                                                       DataType,
                                                                       DataType,
                                                                       DataType,
                                                                       DataType,
                                                                       IndexType,
                                                                       true /*ElementwiseGK*/>(
                    true,  // do_verification
                    1,     // init_method: integer value
                    false, // do_log
                    false, // time_kernel
                    param);
            if(!ok)
            {
                // Remaining configurations are not profiled once one has failed.
                pass = false;
                break;
            }
        }

        EXPECT_TRUE(pass);
    }
};
// Bring the convolution layout tags used in the type lists below into scope.
using namespace ck::tensor_layout::convolution;
// Typed-test parameter lists. Each tuple is (DataType, InLayout, WeiLayout, OutLayout),
// matching the tuple_element indices read by TestGroupedConvndFwd.
// 2D cases: bhalf, float and half data types with the same layout triple.
using KernelTypes2d = ::testing::Types<std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>,
std::tuple<float, NHWGC, GKYXC, NHWGK>,
std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>>;
// 3D cases: same data types with the 3D layout triple.
using KernelTypes3d = ::testing::Types<std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<float, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK>>;
// Empty derived fixtures so the 2D and 3D type lists can be registered as
// separate typed test suites.
template <typename Tuple>
class TestGroupedConvndFwd2d : public TestGroupedConvndFwd<Tuple>
{
};
template <typename Tuple>
class TestGroupedConvndFwd3d : public TestGroupedConvndFwd<Tuple>
{
};
// Bind each fixture to its type list.
TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d);
TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d);
// Queues two 2D problem descriptions and runs them through Run<2>().
// Field order presumably follows the ConvParam constructor
// (ndim, groups, batch, out-channels, in-channels, then per-dimension vectors)
// - confirm against ConvParam.
TYPED_TEST(TestGroupedConvndFwd2d, Test2D)
{
    auto& cases = this->conv_params;
    cases.clear();

    cases.push_back({2, 2, 32, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
    cases.push_back({2, 2, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});

    this->template Run<2>();
}
// Queues two 3D problem descriptions and runs them through Run<3>().
// Field order presumably follows the ConvParam constructor
// (ndim, groups, batch, out-channels, in-channels, then per-dimension vectors)
// - confirm against ConvParam.
TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
{
    auto& cases = this->conv_params;
    cases.clear();

    cases.push_back(
        {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
    cases.push_back(
        {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});

    this->template Run<3>();
}

View File

@@ -187,11 +187,11 @@ __device__ AFragT load_A_col_major(AType const* input_ptr)
auto kMinorOffset = col_major(minorStepCoord2D, BLOCK_M);
auto kMajorOffset = col_major(majorStepCoord2D, BLOCK_M);
using ARawT = typename scalar_type<AFragT>::type;
using AScalarFragT =
vector_type<ARawT,
BLOCK_M * BLOCK_K / WAVE_SIZE /
(ck::is_same_v<ck::remove_cvref_t<AType>, ck::f4x2_pk_t> ? 2 : 1)>::type;
using ARawT = typename scalar_type<AFragT>::type;
using AScalarFragT = typename vector_type<
ARawT,
BLOCK_M * BLOCK_K / WAVE_SIZE /
(ck::is_same_v<ck::remove_cvref_t<AType>, ck::f4x2_pk_t> ? 2 : 1)>::type;
AScalarFragT fragA{};
@@ -319,8 +319,9 @@ __device__ AFragT load_A_row_major(AType const* input_ptr)
// Flatten to 1D row_major offsets.
auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; };
using ARawT = typename scalar_type<AFragT>::type;
using AScalarChunkT = vector_type<ARawT, scalar_type<AFragT>::vector_size / num_chunks>::type;
using ARawT = typename scalar_type<AFragT>::type;
using AScalarChunkT =
typename vector_type<ARawT, scalar_type<AFragT>::vector_size / num_chunks>::type;
union
{
@@ -544,8 +545,9 @@ __device__ BFragT load_B_col_major(BType const* input_ptr)
auto majorStepCoord2D = std::make_pair(chunk_offset, 0); // read a chunk from a col
using BRawT = typename scalar_type<BFragT>::type;
using BScalarChunkT = vector_type<BRawT, scalar_type<BFragT>::vector_size / num_chunks>::type;
using BRawT = typename scalar_type<BFragT>::type;
using BScalarChunkT =
typename vector_type<BRawT, scalar_type<BFragT>::vector_size / num_chunks>::type;
union
{
@@ -780,7 +782,7 @@ struct store_C_col_major<CType, CFragT, 32, 32>
// we can vector store 4 contiguous elements at a time.
using CRawT = typename scalar_type<CFragT>::type;
using CScalarFragT = vector_type<CRawT, VW>::type;
using CScalarFragT = typename vector_type<CRawT, VW>::type;
union
{
CFragT frag;
@@ -940,12 +942,14 @@ __global__ void matmul(const packed_type_t<AType>* a, const packed_type_t<BType>
assert(threadIdx.x < WAVE_SIZE);
assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1);
using AFragT = vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
using BFragT = vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
using AFragT =
typename vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
using BFragT =
typename vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
using CFragT = vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using CFragT = typename vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using AccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>;
using RawAccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using RawAccumFragT = typename vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
// Create frags
auto fragA = AFragT{};
@@ -1019,14 +1023,16 @@ __global__ void matmul(const packed_type_t<AType>* a,
assert(threadIdx.x < WAVE_SIZE);
assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1);
using AFragT = vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
using BFragT = vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
using AFragT =
typename vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
using BFragT =
typename vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
using CFragT = vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using CFragT = typename vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using AccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>;
using RawAccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using AScaleFragT = vector_type<ScaleType, 1>::type;
using BScaleFragT = vector_type<ScaleType, 1>::type;
using RawAccumFragT = typename vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using AScaleFragT = typename vector_type<ScaleType, 1>::type;
using BScaleFragT = typename vector_type<ScaleType, 1>::type;
// Create frags
auto fragA = AFragT{};