mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-03 21:21:22 +00:00
[rocm-libraries] ROCm/rocm-libraries#4797 (commit 1a30400)
[CK_TILE] Add CK Tile bwd weight profiler

## Motivation

To compare old CK and CK Tile, we need to extend the current CK profiler so that it can also run CK Tile instances through the same API. To give CK Tile the same instance coverage as the old CK, I've added code generation from old CK configurations to CK Tile instances using the CK Builder.

## Technical Details

- The codegen Python script for CK Tile fwd convs is extended to also support bwd weight and bwd data.
- The generated instances are added to the CMake build (`device_grouped_conv_bwd_weight_tile_instance` targets).
- A new profiler op (`grouped_conv_bwd_weight_tile`) has been added to the CK Profiler.
This commit is contained in:
committed by
assistant-librarian[bot]
parent
fc1e1a5155
commit
ae4e632c7d
@@ -150,7 +150,7 @@ struct tile_distribution_encoding_pattern_2d<BlockSize,
|
||||
static constexpr index_t num_warps = BlockSize / get_warp_size();
|
||||
static constexpr index_t LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size);
|
||||
static constexpr index_t X1 = VecSize > LargestVec ? LargestVec : VecSize;
|
||||
static constexpr index_t X0 = XPerTile / X1; // # of threads in X dim
|
||||
static constexpr index_t X0 = min(warp_size, XPerTile / X1); // # of threads in X dim
|
||||
|
||||
// # of rows in Y dim accessed by single wavefront in one iteration
|
||||
static constexpr index_t Y1 = warp_size / X0;
|
||||
@@ -234,7 +234,7 @@ struct tile_distribution_encoding_pattern_2d<BlockSize,
|
||||
static constexpr index_t num_warps = BlockSize / get_warp_size();
|
||||
static constexpr index_t LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size);
|
||||
static constexpr index_t X1 = VecSize > LargestVec ? LargestVec : VecSize;
|
||||
static constexpr index_t X0 = XPerTile / X1; // # of threads in X dim
|
||||
static constexpr index_t X0 = min(warp_size, XPerTile / X1); // # of threads in X dim
|
||||
|
||||
static constexpr index_t Y2 = warp_size / X0; // # of rows in Y dim to cover whole wavefront
|
||||
static_assert(X0 * Y2 == warp_size, "X0 * Y2 must cover whole wavefront!");
|
||||
@@ -289,7 +289,7 @@ struct tile_distribution_encoding_pattern_2d<BlockSize,
|
||||
static constexpr index_t num_warps = BlockSize / get_warp_size();
|
||||
static constexpr index_t LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size);
|
||||
static constexpr index_t X1 = VecSize > LargestVec ? LargestVec : VecSize;
|
||||
static constexpr index_t X0 = XPerTile / X1; // # of threads in X dim
|
||||
static constexpr index_t X0 = min(warp_size, XPerTile / X1); // # of threads in X dim
|
||||
static constexpr index_t Y2 = warp_size / X0; // # of rows in Y dim to cover whole wavefront
|
||||
static_assert(X0 * Y2 == warp_size, "X0 * Y2 must cover whole wavefront!");
|
||||
static constexpr index_t Y1 = num_warps;
|
||||
|
||||
@@ -16,7 +16,15 @@ void CK_TILE_ERROR(Args&&... args) noexcept
|
||||
{
|
||||
std::ostringstream oss;
|
||||
(oss << ... << args);
|
||||
std::cerr << "[ERROR] " << oss.str() << std::endl;
|
||||
std::cerr << "[CK_TILE_ERROR] " << oss.str() << std::endl;
|
||||
}
|
||||
|
||||
// Writes an informational message to std::cout.
//
// All arguments are streamed, in order, into a temporary std::ostringstream
// and the resulting text is printed on one line prefixed with
// "[CK_TILE_INFO] ". Every argument type must therefore be insertable into a
// std::ostream with operator<<.
//
// The function is declared noexcept, so an exception escaping the stream
// operations would terminate the program rather than propagate to the caller.
template <typename... Args>
|
||||
void CK_TILE_INFO(Args&&... args) noexcept
|
||||
{
|
||||
// Build the whole message first so it reaches std::cout in a single insertion.
std::ostringstream oss;
|
||||
// Binary left fold over operator<<: ((oss << a0) << a1) << ...
// The arguments are inserted as lvalues (they are not std::forward-ed).
(oss << ... << args);
|
||||
// std::endl appends a newline and flushes std::cout.
std::cout << "[CK_TILE_INFO] " << oss.str() << std::endl;
|
||||
}
|
||||
|
||||
namespace internal {
|
||||
|
||||
Reference in New Issue
Block a user