[rocm-libraries] ROCm/rocm-libraries#4797 (commit 1a30400)

[CK_TILE] Add CK Tile bwd weight profiler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Motivation

To compare old CK and CK Tile, we need to extend the current CK profiler
to also support running CK Tile instances with the same API. In order to
have the same instance coverage in CK Tile compared to the old CK, I've
added code generation from old CK configurations to CK Tile instances
using the CK Builder.

## Technical Details

- The codegen Python script for CK Tile fwd convs is extended to also
support bwd weight and bwd data.
- The generated instances are added to the CMake build (target
`device_grouped_conv_bwd_weight_tile_instances`).
- A new profiler op (`grouped_conv_bwd_weight_tile`) has been added to
the CK Profiler.
This commit is contained in:
Ville Pietilä
2026-03-04 21:50:29 +00:00
committed by assistant-librarian[bot]
parent fc1e1a5155
commit ae4e632c7d
68 changed files with 5194 additions and 196 deletions

View File

@@ -22,6 +22,15 @@
namespace ck_tile {
/// @brief Forwards its arguments to the CK_TILE_INFO logging macro, but only
///        when the CK_TILE_LOGGING environment toggle is enabled.
///
/// Acts as a no-op when logging is disabled, so call sites can log
/// unconditionally without wrapping each call in an environment check.
/// Marked noexcept so validation paths that use it stay non-throwing.
///
/// @param args Values perfectly forwarded to CK_TILE_INFO.
template <typename... Args>
CK_TILE_HOST void LogInfo(Args&&... args) noexcept
{
    // Guard clause: bail out immediately when logging is not enabled.
    if(!ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
    {
        return;
    }
    CK_TILE_INFO(std::forward<Args>(args)...);
}
/// @brief The Grouped Convolution kernel device arguments.
template <typename GroupedConvTraitsType_>
struct GroupedConvBwdWeightKernelArgs
@@ -106,13 +115,18 @@ struct GroupedConvBwdWeightKernelArgs
k_batch = args.k_batch;
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
{
std::cout << "GemmM: " << GemmM << ", GemmN: " << GemmN << ", GemmK: " << GemmK
<< ", GemmBatch: " << GemmBatch
<< ", NumGroupsPerBatch: " << NumGroupsPerBatch << ", k_batch: " << k_batch
<< std::endl;
}
LogInfo("GemmM: ",
GemmM,
", GemmN: ",
GemmN,
", GemmK: ",
GemmK,
", GemmBatch: ",
GemmBatch,
", NumGroupsPerBatch: ",
NumGroupsPerBatch,
", k_batch: ",
k_batch);
}
template <
@@ -192,13 +206,18 @@ struct GroupedConvBwdWeightKernelArgs
k_batch = args.k_batch;
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
{
std::cout << "GemmM: " << GemmM << ", GemmN: " << GemmN << ", GemmK: " << GemmK
<< ", GemmBatch: " << GemmBatch
<< ", NumGroupsPerBatch: " << NumGroupsPerBatch << ", k_batch: " << k_batch
<< std::endl;
}
LogInfo("GemmM: ",
GemmM,
", GemmN: ",
GemmN,
", GemmK: ",
GemmK,
", GemmBatch: ",
GemmBatch,
", NumGroupsPerBatch: ",
NumGroupsPerBatch,
", k_batch: ",
k_batch);
}
template <
@@ -285,13 +304,18 @@ struct GroupedConvBwdWeightKernelArgs
k_batch = args.k_batch;
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
{
std::cout << "GemmM: " << GemmM << ", GemmN: " << GemmN << ", GemmK: " << GemmK
<< ", GemmBatch: " << GemmBatch
<< ", NumGroupsPerBatch: " << NumGroupsPerBatch << ", k_batch: " << k_batch
<< std::endl;
}
LogInfo("GemmM: ",
GemmM,
", GemmN: ",
GemmN,
", GemmK: ",
GemmK,
", GemmBatch: ",
GemmBatch,
", NumGroupsPerBatch: ",
NumGroupsPerBatch,
", k_batch: ",
k_batch);
}
using ABCGridDescs = remove_cvref_t<
@@ -474,12 +498,12 @@ struct GroupedConvolutionBackwardWeightKernel
CK_TILE_HOST static constexpr GroupedConvBwdWeightKernelArgsSpecialized
MakeKernelArgs(const GroupedConvBwdWeightHostArgs& hostArgs)
{
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
{
std::cout << "MPerBlock: " << number<TilePartitioner::MPerBlock>{} << std::endl;
std::cout << "NPerBlock: " << number<TilePartitioner::NPerBlock>{} << std::endl;
std::cout << "KPerBlock: " << number<TilePartitioner::KPerBlock>{} << std::endl;
}
LogInfo("MPerBlock: ",
number<TilePartitioner::MPerBlock>{},
", NPerBlock: ",
number<TilePartitioner::NPerBlock>{},
", KPerBlock: ",
number<TilePartitioner::KPerBlock>{});
auto kernel_args = GroupedConvBwdWeightKernelArgsSpecialized(hostArgs);
@@ -517,11 +541,7 @@ struct GroupedConvolutionBackwardWeightKernel
}
if(kargs.k_batch < 1)
{
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
{
CK_TILE_ERROR(
"k_batch must be at least one. Ensure argument is created via MakeKernelArgs.");
}
LogInfo("k_batch must be at least one. Ensure argument is created via MakeKernelArgs.");
return false;
}
@@ -533,12 +553,8 @@ struct GroupedConvolutionBackwardWeightKernel
// accuracy issues. Hence, we limit the maximum split-K value to 128 in such cases.
if(kargs.k_batch > 128)
{
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
{
CK_TILE_ERROR(
"For epilogue output data type that is not float/double, we must have "
LogInfo("For epilogue output data type that is not float/double, we must have "
"k_batch <= 128.");
}
return false;
}
}
@@ -548,20 +564,24 @@ struct GroupedConvolutionBackwardWeightKernel
{
if(kargs.k_batch != 1)
{
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
{
CK_TILE_ERROR("Conditions not met for K_batch > 1!");
}
LogInfo("Conditions not met for K_batch > 1: VectorSizeC must be a multiple of 2 "
"for fp16/bf16 when K_batch > 1.",
"Now k_batch is ",
kargs.k_batch,
", VectorSizeC is ",
GroupedConvTraitsType_::VectorSizeC);
return false;
}
}
if(kargs.GemmK < TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}) * kargs.k_batch)
{
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
{
CK_TILE_ERROR("KBatch is too large, part of GPU wouldn't be utilized!");
}
LogInfo("KBatch is too large, part of GPU wouldn't be utilized! GemmK: ",
kargs.GemmK,
", BlockGemmShape K: ",
TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}),
", k_batch: ",
kargs.k_batch);
return false;
}
@@ -581,6 +601,17 @@ struct GroupedConvolutionBackwardWeightKernel
if(!(SpatialDim == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0))
{
LogInfo("For Filter1x1Stride1Pad0 specialization, all spatial dimensions must "
"be 1, stride must be 1, and padding must be 0. Now for dimension ",
i,
": SpatialDim is ",
SpatialDim,
", ConvStride is ",
ConvStride,
", LeftPad is ",
LeftPad,
", RightPad is ",
RightPad);
return false;
}
}
@@ -596,6 +627,15 @@ struct GroupedConvolutionBackwardWeightKernel
if(!(SpatialDim == 1 && LeftPad == 0 && RightPad == 0))
{
LogInfo("For Filter1x1Pad0 specialization, all spatial dimensions must be 1 "
"and padding must be 0. Now for dimension ",
i,
": SpatialDim is ",
SpatialDim,
", LeftPad is ",
LeftPad,
", RightPad is ",
RightPad);
return false;
}
}
@@ -604,6 +644,7 @@ struct GroupedConvolutionBackwardWeightKernel
{
if(ConvC != 1)
{
LogInfo("For Filter3x3 specialization, ConvC must be 1. Now ConvC is ", ConvC);
return false;
}
for(index_t i = 0; i < NDimSpatial; ++i)
@@ -612,6 +653,11 @@ struct GroupedConvolutionBackwardWeightKernel
if(filter_spatial_dim != I3)
{
LogInfo("For Filter3x3 specialization, all spatial dimensions of the filter "
"must be 3. Now for dimension ",
i,
", filter_spatial_dim is ",
filter_spatial_dim);
return false;
}
}
@@ -620,8 +666,7 @@ struct GroupedConvolutionBackwardWeightKernel
if constexpr(GroupedConvTraitsType_::ExplicitGemm &&
ConvSpecialization != ConvolutionSpecialization::Filter1x1Stride1Pad0)
{
CK_TILE_ERROR(
"Explicit Gemm is supported only for Filter1x1Stride1Pad0 specialization!");
LogInfo("ExplicitGemm is only supported for Filter1x1Stride1Pad0 specialization.");
return false;
}
@@ -633,14 +678,16 @@ struct GroupedConvolutionBackwardWeightKernel
// Check access per C
if(ConvC % GroupedConvTraitsType_::VectorSizeB != 0)
{
CK_TILE_ERROR("Conv C is not a multiple of vector load size for "
"input image!");
LogInfo("Conv C is not a multiple of vector load size for input! ConvC: ",
ConvC,
", VectorSizeB: ",
GroupedConvTraitsType_::VectorSizeB);
return false;
}
}
else
{
CK_TILE_ERROR("Not supported input layout!");
LogInfo("Not supported input layout! Now InLayout is ", InLayout::name);
return false;
}
@@ -650,13 +697,16 @@ struct GroupedConvolutionBackwardWeightKernel
{
if(ConvC % GroupedConvTraitsType_::VectorSizeC != 0)
{
CK_TILE_ERROR("Conv C is not a multiple of vector load size for weight!");
LogInfo("Conv C is not a multiple of vector load size for weight! ConvC: ",
ConvC,
", VectorSizeC: ",
GroupedConvTraitsType_::VectorSizeC);
return false;
}
}
else
{
CK_TILE_ERROR("Not supported weight layout!");
LogInfo("Not supported weight layout! Now WeiLayout is ", WeiLayout::name);
return false;
}
@@ -666,14 +716,16 @@ struct GroupedConvolutionBackwardWeightKernel
{
if(ConvK % GroupedConvTraitsType_::VectorSizeA != 0)
{
CK_TILE_ERROR("Conv K is not a multiple of vector store size "
"for output image!");
LogInfo("Conv K is not a multiple of vector load size for output! ConvK: ",
ConvK,
", VectorSizeA: ",
GroupedConvTraitsType_::VectorSizeA);
return false;
}
}
else
{
CK_TILE_ERROR("Not supported output layout!");
LogInfo("Not supported output layout! Now OutLayout is ", OutLayout::name);
return false;
}
@@ -682,7 +734,10 @@ struct GroupedConvolutionBackwardWeightKernel
const index_t ConvG = kargs.wei_g_k_c_xs_lengths[number<0>{}];
if(ConvG % GroupedConvTraitsType_::NumGroupsToMerge != 0)
{
CK_TILE_ERROR("ConvG must be a multiple of NumGroupsToMerge!");
LogInfo("Number of groups must be divisible by NumGroupsToMerge! ConvG: ",
ConvG,
", NumGroupsToMerge: ",
GroupedConvTraitsType_::NumGroupsToMerge);
return false;
}
}