mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
[CK_TILE] Fix alignment in Stream-K workspace buffer (#3625)
* Fix alignment issue in Stream-K workspace buffer In CK Tile Stream-K, the workspace buffer is used to hold flags and partials, where the first i bytes holds the flags and the remaining bytes hold partials. This change adds padding to the flags prefix of the workspace buffer to ensure the number of bytes is 128B-aligned. Without this alignment, since workgroups do not skip cache when reading from partials, they may read stale partials data in cache, leading to incorrect results. The added padding avoids the stale data reading. This change also re-enables the test_ck_tile_streamk_reduction tests. * Compute reference GEMM on GPU for test verification to decrease testing time
This commit is contained in:
@@ -42,7 +42,8 @@ struct StreamKTilePartitionerBase
|
||||
CK_TILE_HOST_DEVICE index_t get_partials_buffer_size(index_t acc_element_bytes) const noexcept;
|
||||
|
||||
/**
|
||||
* @brief Calculates the total space needed for the flags buffer.
|
||||
* @brief Calculates the total space needed for the flags buffer whose total byte size is
|
||||
* 128B-aligned.
|
||||
*
|
||||
* @return index_t The number of bytes needed for the flags buffer.
|
||||
*/
|
||||
|
||||
@@ -58,7 +58,10 @@ CK_TILE_HOST_DEVICE index_t
|
||||
StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_flags_buffer_size()
|
||||
const noexcept
|
||||
{
|
||||
return sizeof(index_t) * sk_ctas_;
|
||||
constexpr index_t alignment = 128;
|
||||
const index_t required_bytes = sizeof(index_t) * sk_ctas_;
|
||||
const index_t padded_bytes = ck_tile::integer_least_multiple(required_bytes, alignment);
|
||||
return padded_bytes;
|
||||
}
|
||||
|
||||
template <typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategyType>
|
||||
|
||||
Reference in New Issue
Block a user