mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
[rocm-libraries] ROCm/rocm-libraries#4756 (commit 79bc2ca)
[CK_TILE] Update Stream-K Reduction Strategy Enum
## Motivation
Currently, Stream-K has 3 reduction options: 1) atomics, 2) The
reduction described in the Stream-K paper, and 3) a tree reduction. The
reduction strategy described in the original Stream-K paper has the
starting workgroup of each tile sequentially accumulating partial
results of other contributing workgroups in the tile, which requires a
linear number of steps. Hence, for clarity, this works updates the
naming of the `StreamKReductionStrategy` enum members to better describe
the existing reduction strategy options.
## Technical Details
Prior to this change, the enum is as follows:
```cpp
enum StreamKReductionStrategy : uint32_t
{
Atomic = 0u,
Reduction = 1u,
TreeReduction = 2u
};
```
But, the distinction between `Reduction` and `TreeReduction` is not very
clear and has some redundancy.
Hence, the updated enum is as follows:
```cpp
enum StreamKReductionStrategy : uint32_t
{
Atomic = 0u,
Linear = 1u,
Tree = 2u
};
```
All references to `StreamKReductionStrategy` were updated to reflect
this change.
## Test Plan
No new functionality was added, so no new tests were added; I just
validated existing tests and examples.
## Test Result
All tests passed locally.
## Submission Checklist
- [x] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
This commit is contained in:
committed by
assistant-librarian[bot]
parent
6aa1cd8212
commit
fc3180120e
@@ -22,7 +22,7 @@ args:
|
||||
-a_layout tensor A data layout (default: R)
|
||||
-b_layout tensor B data layout (default: C)
|
||||
-c_layout tensor C data layout (default: R)
|
||||
-reduction_strategy strategy for storing results in C tensor. atomic/reduction (default:atomic)
|
||||
-reduction_strategy strategy for storing results in C tensor. atomic/linear (default:atomic)
|
||||
-persistent_dp persistent strategy for data-parallel section. Set to 0 for non-persistent or to 1 for persistent. (default:0)
|
||||
-stride_a tensor A stride (default:0)
|
||||
-stride_b tensor B stride (default:0)
|
||||
|
||||
@@ -65,7 +65,7 @@ auto createArgs(int argc, char* argv[])
|
||||
.insert("c_layout", "R", "C tensor data layout - Row by default")
|
||||
.insert("reduction_strategy",
|
||||
"atomic",
|
||||
"strategy for storing results in C tensor - atomic/reduction")
|
||||
"strategy for storing results in C tensor - atomic/linear")
|
||||
.insert("persistent_dp",
|
||||
"0",
|
||||
"0. Non-persistent data-parallel section, 1 Fully persistent kernel.")
|
||||
|
||||
@@ -119,7 +119,7 @@ std::tuple<float, ck_tile::index_t> invokeGemm(ck_tile::DeviceMem& a_m_k_device_
|
||||
DsLayout,
|
||||
CLayout,
|
||||
CDEElementWise,
|
||||
ck_tile::StreamKReductionStrategy::Reduction>(
|
||||
ck_tile::StreamKReductionStrategy::Linear>(
|
||||
args,
|
||||
ck_tile::stream_config{
|
||||
nullptr, true, 1, warmup_iterations, repeat_iterations, true, flush_cache});
|
||||
@@ -155,9 +155,9 @@ ck_tile::StreamKReductionStrategy getReductionStrategyValue(const std::string& s
|
||||
{
|
||||
return ck_tile::StreamKReductionStrategy::Atomic;
|
||||
}
|
||||
else if(strategy == "reduction")
|
||||
else if(strategy == "linear")
|
||||
{
|
||||
return ck_tile::StreamKReductionStrategy::Reduction;
|
||||
return ck_tile::StreamKReductionStrategy::Linear;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -112,7 +112,7 @@ std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
|
||||
hipGetErrorString(hipMemsetAsync(
|
||||
args.e_ptr, 0, args.M * args.N * sizeof(CDataType), stream_config.stream_id_));
|
||||
}
|
||||
else if constexpr(ReductionStrategy == ck_tile::StreamKReductionStrategy::Reduction)
|
||||
else if constexpr(ReductionStrategy == ck_tile::StreamKReductionStrategy::Linear)
|
||||
{
|
||||
// Reset sk flags to zero before each repetition of the kernel
|
||||
workspace_data.SetZero();
|
||||
|
||||
@@ -8,8 +8,8 @@
|
||||
namespace ck_tile {
|
||||
enum StreamKReductionStrategy : uint32_t
|
||||
{
|
||||
Atomic = 0u,
|
||||
Reduction = 1u,
|
||||
TreeReduction = 2u
|
||||
Atomic = 0u,
|
||||
Linear = 1u,
|
||||
Tree = 2u
|
||||
};
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -508,8 +508,8 @@ struct StreamKKernel
|
||||
{
|
||||
BaseGemm(kargs, tile_idx, num_loop_sk, i_k_a, i_k_b, k_size, smem_ptr_0);
|
||||
}
|
||||
else if(TilePartitioner::ReductionStrategy == StreamKReductionStrategy::Reduction ||
|
||||
TilePartitioner::ReductionStrategy == StreamKReductionStrategy::TreeReduction)
|
||||
else if(TilePartitioner::ReductionStrategy == StreamKReductionStrategy::Linear ||
|
||||
TilePartitioner::ReductionStrategy == StreamKReductionStrategy::Tree)
|
||||
{
|
||||
const auto c_macro_tile_idx =
|
||||
kargs.tile_partitioner.get_output_tile_index(tile_idx);
|
||||
@@ -548,8 +548,7 @@ struct StreamKKernel
|
||||
auto tile_started = iter_start == tile_iter_start;
|
||||
auto tile_ended = iter_end >= tile_iter_end;
|
||||
|
||||
if constexpr(TilePartitioner::ReductionStrategy ==
|
||||
StreamKReductionStrategy::Reduction)
|
||||
if constexpr(TilePartitioner::ReductionStrategy == StreamKReductionStrategy::Linear)
|
||||
{
|
||||
if(!tile_started)
|
||||
{
|
||||
|
||||
@@ -153,8 +153,8 @@ CK_TILE_HOST_DEVICE index_t
|
||||
StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_workspace_size(
|
||||
index_t acc_element_bytes) const noexcept
|
||||
{
|
||||
if constexpr(ReductionStrategy == StreamKReductionStrategy::Reduction ||
|
||||
ReductionStrategy == StreamKReductionStrategy::TreeReduction)
|
||||
if constexpr(ReductionStrategy == StreamKReductionStrategy::Linear ||
|
||||
ReductionStrategy == StreamKReductionStrategy::Tree)
|
||||
{
|
||||
|
||||
return get_partials_buffer_size(acc_element_bytes) + get_flags_buffer_size();
|
||||
|
||||
@@ -247,17 +247,15 @@ class TestCkTileStreamK : public ::testing::Test
|
||||
num_accumulations_per_tile = invoke_streamk<ck_tile::StreamKReductionStrategy::Atomic>(
|
||||
args, ck_tile::stream_config{nullptr, false, 0, 0, 1});
|
||||
}
|
||||
else if(reduction_strategy == ck_tile::StreamKReductionStrategy::Reduction)
|
||||
else if(reduction_strategy == ck_tile::StreamKReductionStrategy::Linear)
|
||||
{
|
||||
num_accumulations_per_tile =
|
||||
invoke_streamk<ck_tile::StreamKReductionStrategy::Reduction>(
|
||||
args, ck_tile::stream_config{nullptr, false, 0, 0, 1});
|
||||
num_accumulations_per_tile = invoke_streamk<ck_tile::StreamKReductionStrategy::Linear>(
|
||||
args, ck_tile::stream_config{nullptr, false, 0, 0, 1});
|
||||
}
|
||||
else
|
||||
{
|
||||
num_accumulations_per_tile =
|
||||
invoke_streamk<ck_tile::StreamKReductionStrategy::TreeReduction>(
|
||||
args, ck_tile::stream_config{nullptr, false, 0, 0, 1});
|
||||
num_accumulations_per_tile = invoke_streamk<ck_tile::StreamKReductionStrategy::Tree>(
|
||||
args, ck_tile::stream_config{nullptr, false, 0, 0, 1});
|
||||
}
|
||||
|
||||
c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
|
||||
|
||||
@@ -56,7 +56,7 @@ TEST(StreamKTilePartitionerBaseGetFlagsBufferSize, FlagsLessThan128Bytes)
|
||||
using Config = StreamKTilePartitionerBaseConfigDP2TileSK;
|
||||
|
||||
ck_tile::StreamKTilePartitionerBase<Config::GemmShape,
|
||||
ck_tile::StreamKReductionStrategy::Reduction>
|
||||
ck_tile::StreamKReductionStrategy::Linear>
|
||||
tile_partitioner{Config::M, Config::N, Config::K, Config::GRID};
|
||||
|
||||
EXPECT_EQ(tile_partitioner.get_flags_buffer_size(), 128);
|
||||
@@ -67,7 +67,7 @@ TEST(StreamKTilePartitionerBaseGetFlagsBufferSize, FlagsEqual128Bytes)
|
||||
using Config = StreamKTilePartitionerBaseConfigFlagsSizeEqual128Bytes;
|
||||
|
||||
ck_tile::StreamKTilePartitionerBase<Config::GemmShape,
|
||||
ck_tile::StreamKReductionStrategy::Reduction>
|
||||
ck_tile::StreamKReductionStrategy::Linear>
|
||||
tile_partitioner{Config::M, Config::N, Config::K, Config::GRID};
|
||||
|
||||
EXPECT_EQ(tile_partitioner.get_flags_buffer_size(), 128);
|
||||
@@ -78,7 +78,7 @@ TEST(StreamKTilePartitionerBaseGetFlagsBufferSize, FlagsGreaterThan128Bytes)
|
||||
using Config = StreamKTilePartitionerBaseConfigFlagsSizeGreaterThan128Bytes;
|
||||
|
||||
ck_tile::StreamKTilePartitionerBase<Config::GemmShape,
|
||||
ck_tile::StreamKReductionStrategy::Reduction>
|
||||
ck_tile::StreamKReductionStrategy::Linear>
|
||||
tile_partitioner{Config::M, Config::N, Config::K, Config::GRID};
|
||||
|
||||
EXPECT_EQ(tile_partitioner.get_flags_buffer_size(), 256);
|
||||
@@ -99,7 +99,7 @@ TEST(StreamKTilePartitionerBaseGetWorkSpaceSize, ReductionStrategy)
|
||||
using Config = StreamKTilePartitionerBaseConfigDP2TileSK;
|
||||
|
||||
ck_tile::StreamKTilePartitionerBase<Config::GemmShape,
|
||||
ck_tile::StreamKReductionStrategy::Reduction>
|
||||
ck_tile::StreamKReductionStrategy::Linear>
|
||||
tile_partitioner{Config::M, Config::N, Config::K, Config::GRID};
|
||||
|
||||
ck_tile::index_t expected_partials_size =
|
||||
|
||||
@@ -72,7 +72,7 @@ class TestVariant(Enum):
|
||||
)
|
||||
REDUCTION_SMOKE = (
|
||||
2,
|
||||
["reduction", "tree"],
|
||||
["linear", "tree"],
|
||||
[True, False],
|
||||
["fp16", "bf16", "fp8", "bf8"],
|
||||
"Stream-K reduction smoke tests",
|
||||
|
||||
@@ -376,8 +376,8 @@ class GemmKernelBuilder:
|
||||
|
||||
reduction_strategy_map = {
|
||||
"atomic": "ck_tile::StreamKReductionStrategy::Atomic",
|
||||
"reduction": "ck_tile::StreamKReductionStrategy::Reduction",
|
||||
"tree": "ck_tile::StreamKReductionStrategy::TreeReduction",
|
||||
"linear": "ck_tile::StreamKReductionStrategy::Linear",
|
||||
"tree": "ck_tile::StreamKReductionStrategy::Tree",
|
||||
}
|
||||
|
||||
# Determine accumulator type based on datatype
|
||||
@@ -449,7 +449,7 @@ struct SelectedKernel {{
|
||||
static constexpr bool UsePersistentKernel = {"true" if str(persistent).lower() == "true" else "false"};
|
||||
static constexpr bool UseStructuredSparsity = false;
|
||||
static constexpr ck_tile::index_t NumWaveGroups = 1;
|
||||
static constexpr ck_tile::StreamKReductionStrategy reduction_strategy = {reduction_strategy_map.get(reduction_strategy, "ck_tile::StreamKReductionStrategy::Reduction")};
|
||||
static constexpr ck_tile::StreamKReductionStrategy reduction_strategy = {reduction_strategy_map.get(reduction_strategy, "ck_tile::StreamKReductionStrategy::Linear")};
|
||||
|
||||
// Tile shape
|
||||
using TileShape = ck_tile::TileGemmShape<
|
||||
@@ -552,12 +552,12 @@ struct SelectedKernel {{
|
||||
hipGetErrorString(hipMemsetAsync(
|
||||
args.e_ptr, 0, args.M * args.N * sizeof(CDataType), stream.stream_id_));
|
||||
}}
|
||||
else if(reduction_strategy == ck_tile::StreamKReductionStrategy::Reduction)
|
||||
else if(reduction_strategy == ck_tile::StreamKReductionStrategy::Linear)
|
||||
{{
|
||||
// Reset sk flags to zero before each repetition of the kernel
|
||||
workspace_data.SetZero();
|
||||
}}
|
||||
else if(reduction_strategy == ck_tile::StreamKReductionStrategy::TreeReduction)
|
||||
else if(reduction_strategy == ck_tile::StreamKReductionStrategy::Tree)
|
||||
{{
|
||||
// Reset sk flags to zero before each repetition of the kernel
|
||||
workspace_data.SetZero();
|
||||
|
||||
@@ -169,9 +169,9 @@ class GemmProfiler
|
||||
auto reduction_strategy =
|
||||
SelectedKernel::reduction_strategy == ck_tile::StreamKReductionStrategy::Atomic
|
||||
? "Atomic"
|
||||
: SelectedKernel::reduction_strategy == ck_tile::StreamKReductionStrategy::Reduction
|
||||
? "Reduction"
|
||||
: "TreeReduction";
|
||||
: SelectedKernel::reduction_strategy == ck_tile::StreamKReductionStrategy::Linear
|
||||
? "Linear"
|
||||
: "Tree";
|
||||
|
||||
KernelInstance kernel_instance{
|
||||
name, dp_persistent, reduction_strategy, gemm_problem, {-1.0f, -1.0f, -1.0f}};
|
||||
|
||||
Reference in New Issue
Block a user