mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-02 04:31:25 +00:00
[CK_TILE] Remove Old CK Tile Stream-K Artifacts (#3202)
* Remove old CK Tile Stream-K implementation

  The original CK Stream-K implementation was based on old CK's Stream-K block to C tile map. However, this implementation did not align with the original Stream-K paper. Thus, we implemented a new tile partitioner and associated Stream-K kernel, which was placed in the reboot namespace. Now that the new Stream-K implementation is ready, this change removes all artifacts of the old implementation.

  Specifically, the following changes were made:
  - Removes old Stream-K tile partitioner from CK Tile
  - Removes the reboot namespace such that the new implementation resides in the ck_tile namespace only.
  - Adds tests for bf8 and fp8 using the new implementation
  - Removes tests for the old implementation
  - Remove the v2 suffix from the new CK Tile Tile Partitioner derived classes.
  - Updates Stream-K Kernel ops file to use /** commenting style.

* Remove v2 from tile partitioner validation function names
This commit is contained in:
@@ -11,33 +11,4 @@ enum StreamKReductionStrategy : uint32_t
|
||||
Atomic = 0u,
|
||||
Reduction = 1u
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Estimates the number of Stream-K workgroups per macro tile in the C tensor.
|
||||
*
|
||||
* @param sk_ctas Number of Stream-K workgroups.
|
||||
* @param iters_per_sk_cta Number of iterations per Stream-K workgroup.
|
||||
* @param iters_per_tile Number of iterations per tile (i.e., the number of macro tiles in the K
|
||||
* dimension).
|
||||
* @return ck_tile::index_t An estimate of the number of workgroups per macro tile in the C tensor.
|
||||
* @note It is assumed that `iters_per_sk_cta` > 0.
|
||||
*/
|
||||
template <ck_tile::StreamKReductionStrategy ReductionStrategy>
|
||||
ck_tile::index_t
|
||||
estimate_num_wgs_per_tile(index_t sk_ctas, index_t iters_per_sk_cta, index_t iters_per_tile)
|
||||
{
|
||||
// In the case of non-atomic reduction or data-parallel only, there will always be 1 workgroup
|
||||
// writing final results to a given macro tile in C.
|
||||
int num_wgs_per_tile = 1;
|
||||
|
||||
// Otherwise, for atomics, multiple workgroups may be writing to the same macro tile in C.
|
||||
if(sk_ctas > 0 && ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic)
|
||||
{
|
||||
// Estimate the number of workgroups per macro tile.
|
||||
num_wgs_per_tile =
|
||||
(iters_per_tile / iters_per_sk_cta) + ((iters_per_tile % iters_per_sk_cta) != 0);
|
||||
}
|
||||
|
||||
return std::max(num_wgs_per_tile, 1);
|
||||
}
|
||||
} // namespace ck_tile
|
||||
|
||||
Reference in New Issue
Block a user