[Regression] Fix CK_TILE build error in grouped_convolution, copy_basic and fused_moegemm_kernel (#2728)

* fix copy basic build error

* fix other ck tile test build error
This commit is contained in:
linqunAMD
2025-08-28 20:30:30 +08:00
committed by GitHub
parent ead4447b20
commit 4a49dac7c6
6 changed files with 30 additions and 16 deletions

View File

@@ -77,10 +77,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
// we intentionally do not use pipeline for this example and let the kernel be composite of
// Problem and Policy
constexpr ck_tile::index_t kBlockSize = Shape::BlockSize;
auto blockSize = Kernel::BlockSize();
// Print configuration information
std::cout << "block size (number of threads per block) " << kBlockSize << std::endl;
std::cout << "block size (number of threads per block) " << blockSize << std::endl;
std::cout << "wave size (number of threads per wave) " << ck_tile::get_warp_size() << std::endl;
std::cout << "block waves (number of waves per block) " << BlockWaves::at(ck_tile::number<0>{})
<< " " << BlockWaves::at(ck_tile::number<1>{}) << std::endl;
@@ -103,7 +103,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
launch_kernel(ck_tile::stream_config{nullptr, true, warmup, repeat, 1},
ck_tile::make_kernel<1>(Kernel{},
kGridSize,
kBlockSize,
blockSize,
0,
static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
static_cast<YDataType*>(y_buf.GetDeviceBuffer()),

View File

@@ -27,8 +27,9 @@ struct TileCopyShape
static constexpr index_t ThreadTile_N = ThreadTile::at(number<1>{});
// Wave tile dimensions
static constexpr index_t Wave_Tile_M = WaveTile::at(number<0>{});
static constexpr index_t WaveSize = get_warp_size();
static constexpr index_t Wave_Tile_N = WaveTile::at(number<1>{});
static constexpr index_t Wave_Tile_M = ThreadTile_M * ThreadTile_N * WaveSize / Wave_Tile_N;
// Block tile dimensions
static constexpr index_t Block_Tile_M = BlockTile::at(number<0>{});
@@ -45,7 +46,6 @@ struct TileCopyShape
Block_Tile_N / (Waves_Per_Block_N * Wave_Tile_N);
// Hardware configuration
static constexpr index_t WaveSize = get_warp_size();
static constexpr index_t BlockSize = Waves_Per_Block_M * Waves_Per_Block_N * WaveSize;
// Configuration validation
@@ -60,8 +60,10 @@ struct TileCopyShape
"Invalid wave configuration for N dimension");
// Ensure wave tile dimensions align with wave size
#if defined(__HIP_DEVICE_COMPILE__)
static_assert(Wave_Tile_M / ThreadTile_M * Wave_Tile_N / ThreadTile_N == WaveSize,
"(Wave_Tile_M/ThreadTile_M) * (Wave_Tile_N/ThreadTile_N) != WaveSize");
#endif
};
/**
@@ -200,6 +202,19 @@ struct ElementWiseTileCopyKernel
using XDataType = typename Problem::XDataType;
using Policy = ck_tile::remove_cvref_t<Policy_>;
static constexpr index_t kBlockSize = Problem::BlockShape::BlockSize;
CK_TILE_HOST static auto BlockSize()
{
if(ck_tile::is_wave32())
{
return kBlockSize / 2;
}
else
{
return kBlockSize;
}
}
CK_TILE_DEVICE void operator()(const XDataType* p_x, XDataType* p_y, index_t M, index_t N) const
{
using S = typename Problem::BlockShape;