diff --git a/example/ck_tile/40_streamk_gemm/README.md b/example/ck_tile/40_streamk_gemm/README.md index 0272b1fe97..e83f24f41d 100644 --- a/example/ck_tile/40_streamk_gemm/README.md +++ b/example/ck_tile/40_streamk_gemm/README.md @@ -22,7 +22,7 @@ args: -a_layout tensor A data layout (default: R) -b_layout tensor B data layout (default: C) -c_layout tensor C data layout (default: R) --reduction_strategy strategy for storing results in C tensor. atomic/reduction (default:atomic) +-reduction_strategy strategy for storing results in C tensor. atomic/linear (default:atomic) -persistent_dp persistent strategy for data-parallel section. Set to 0 for non-persistent or to 1 for persistent. (default:0) -stride_a tensor A stride (default:0) -stride_b tensor B stride (default:0) diff --git a/example/ck_tile/40_streamk_gemm/gemm_utils.hpp b/example/ck_tile/40_streamk_gemm/gemm_utils.hpp index 34c6c6b0ae..7c665b38be 100644 --- a/example/ck_tile/40_streamk_gemm/gemm_utils.hpp +++ b/example/ck_tile/40_streamk_gemm/gemm_utils.hpp @@ -65,7 +65,7 @@ auto createArgs(int argc, char* argv[]) .insert("c_layout", "R", "C tensor data layout - Row by default") .insert("reduction_strategy", "atomic", - "strategy for storing results in C tensor - atomic/reduction") + "strategy for storing results in C tensor - atomic/linear") .insert("persistent_dp", "0", "0. Non-persistent data-parallel section, 1 Fully persistent kernel.") diff --git a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc index 7442bd33f2..cdf1874dab 100644 --- a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc +++ b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc @@ -119,7 +119,7 @@ std::tuple invokeGemm(ck_tile::DeviceMem& a_m_k_device_ DsLayout, CLayout, CDEElementWise, - ck_tile::StreamKReductionStrategy::Reduction>( + ck_tile::StreamKReductionStrategy::Linear>( args, ck_tile::stream_config{ nullptr, true, 1, warmup_iterations, repeat_iterations, true, flush_cache}); @@ -155,9 +155,9 @@ ck_tile::StreamKReductionStrategy getReductionStrategyValue(const std::string& s { return ck_tile::StreamKReductionStrategy::Atomic; } - else if(strategy == "reduction") + else if(strategy == "linear") { - return ck_tile::StreamKReductionStrategy::Reduction; + return ck_tile::StreamKReductionStrategy::Linear; } else { diff --git a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp index 828c861349..6df056573b 100644 --- a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp +++ b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp @@ -112,7 +112,7 @@ std::tuple gemm(const ck_tile::StreamKHostArgs& args, hipGetErrorString(hipMemsetAsync( args.e_ptr, 0, args.M * args.N * sizeof(CDataType), stream_config.stream_id_)); } - else if constexpr(ReductionStrategy == ck_tile::StreamKReductionStrategy::Reduction) + else if constexpr(ReductionStrategy == ck_tile::StreamKReductionStrategy::Linear) { // Reset sk flags to zero before each repetition of the kernel workspace_data.SetZero(); diff --git a/include/ck_tile/ops/common/streamk_common.hpp b/include/ck_tile/ops/common/streamk_common.hpp index c723251112..80e933afb3 100644 --- a/include/ck_tile/ops/common/streamk_common.hpp +++ b/include/ck_tile/ops/common/streamk_common.hpp @@ -8,8 +8,8 @@ namespace ck_tile { enum StreamKReductionStrategy : uint32_t { - Atomic = 0u, - Reduction = 1u, - TreeReduction = 2u + Atomic = 0u, + Linear = 1u, + Tree = 2u }; } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_kernel.hpp index 47e59c4704..b986f2cb37 100644 --- a/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_kernel.hpp @@ -508,8 +508,8 @@ struct StreamKKernel { BaseGemm(kargs, tile_idx, num_loop_sk, i_k_a, i_k_b, k_size, smem_ptr_0); } - else if(TilePartitioner::ReductionStrategy == StreamKReductionStrategy::Reduction || - TilePartitioner::ReductionStrategy == StreamKReductionStrategy::TreeReduction) + else if(TilePartitioner::ReductionStrategy == StreamKReductionStrategy::Linear || + TilePartitioner::ReductionStrategy == StreamKReductionStrategy::Tree) { const auto c_macro_tile_idx = kargs.tile_partitioner.get_output_tile_index(tile_idx); @@ -548,8 +548,7 @@ struct StreamKKernel auto tile_started = iter_start == tile_iter_start; auto tile_ended = iter_end >= tile_iter_end; - if constexpr(TilePartitioner::ReductionStrategy == - StreamKReductionStrategy::Reduction) + if constexpr(TilePartitioner::ReductionStrategy == StreamKReductionStrategy::Linear) { if(!tile_started) { diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner_impl.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner_impl.hpp index f80eec844c..52cfea5872 100644 --- a/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner_impl.hpp +++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner_impl.hpp @@ -153,8 +153,8 @@ CK_TILE_HOST_DEVICE index_t StreamKTilePartitionerBase::get_workspace_size( index_t acc_element_bytes) const noexcept { - if constexpr(ReductionStrategy == StreamKReductionStrategy::Reduction || - ReductionStrategy == StreamKReductionStrategy::TreeReduction) + if constexpr(ReductionStrategy == StreamKReductionStrategy::Linear || + ReductionStrategy == StreamKReductionStrategy::Tree) { return get_partials_buffer_size(acc_element_bytes) + get_flags_buffer_size(); diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp index 96f90a5c2d..5e3b85c009 100644 --- a/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp +++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp @@ -247,17 +247,15 @@ class TestCkTileStreamK : public ::testing::Test num_accumulations_per_tile = invoke_streamk( args, ck_tile::stream_config{nullptr, false, 0, 0, 1}); } - else if(reduction_strategy == ck_tile::StreamKReductionStrategy::Reduction) + else if(reduction_strategy == ck_tile::StreamKReductionStrategy::Linear) { - num_accumulations_per_tile = - invoke_streamk( - args, ck_tile::stream_config{nullptr, false, 0, 0, 1}); + num_accumulations_per_tile = invoke_streamk( + args, ck_tile::stream_config{nullptr, false, 0, 0, 1}); } else { - num_accumulations_per_tile = - invoke_streamk( - args, ck_tile::stream_config{nullptr, false, 0, 0, 1}); + num_accumulations_per_tile = invoke_streamk( + args, ck_tile::stream_config{nullptr, false, 0, 0, 1}); } c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); diff --git a/test/ck_tile/gemm_streamk/test_streamk_tile_partitioner.cpp b/test/ck_tile/gemm_streamk/test_streamk_tile_partitioner.cpp index 30b1b878c5..75c3e0b4fb 100644 --- a/test/ck_tile/gemm_streamk/test_streamk_tile_partitioner.cpp +++ b/test/ck_tile/gemm_streamk/test_streamk_tile_partitioner.cpp @@ -56,7 +56,7 @@ TEST(StreamKTilePartitionerBaseGetFlagsBufferSize, FlagsLessThan128Bytes) using Config = StreamKTilePartitionerBaseConfigDP2TileSK; ck_tile::StreamKTilePartitionerBase + ck_tile::StreamKReductionStrategy::Linear> tile_partitioner{Config::M, Config::N, Config::K, Config::GRID}; EXPECT_EQ(tile_partitioner.get_flags_buffer_size(), 128); @@ -67,7 +67,7 @@ TEST(StreamKTilePartitionerBaseGetFlagsBufferSize, FlagsEqual128Bytes) using Config = StreamKTilePartitionerBaseConfigFlagsSizeEqual128Bytes; ck_tile::StreamKTilePartitionerBase + ck_tile::StreamKReductionStrategy::Linear> tile_partitioner{Config::M, Config::N, Config::K, Config::GRID}; EXPECT_EQ(tile_partitioner.get_flags_buffer_size(), 128); @@ -78,7 +78,7 @@ TEST(StreamKTilePartitionerBaseGetFlagsBufferSize, FlagsGreaterThan128Bytes) using Config = StreamKTilePartitionerBaseConfigFlagsSizeGreaterThan128Bytes; ck_tile::StreamKTilePartitionerBase + ck_tile::StreamKReductionStrategy::Linear> tile_partitioner{Config::M, Config::N, Config::K, Config::GRID}; EXPECT_EQ(tile_partitioner.get_flags_buffer_size(), 256); @@ -99,7 +99,7 @@ TEST(StreamKTilePartitionerBaseGetWorkSpaceSize, ReductionStrategy) using Config = StreamKTilePartitionerBaseConfigDP2TileSK; ck_tile::StreamKTilePartitionerBase + ck_tile::StreamKReductionStrategy::Linear> tile_partitioner{Config::M, Config::N, Config::K, Config::GRID}; ck_tile::index_t expected_partials_size = diff --git a/test/ck_tile/gemm_streamk_tile_engine/generate_configs.py b/test/ck_tile/gemm_streamk_tile_engine/generate_configs.py index ba075a2729..a485f64ade 100644 --- a/test/ck_tile/gemm_streamk_tile_engine/generate_configs.py +++ b/test/ck_tile/gemm_streamk_tile_engine/generate_configs.py @@ -72,7 +72,7 @@ class TestVariant(Enum): ) REDUCTION_SMOKE = ( 2, - ["reduction", "tree"], + ["linear", "tree"], [True, False], ["fp16", "bf16", "fp8", "bf8"], "Stream-K reduction smoke tests", diff --git a/tile_engine/ops/gemm_streamk/gemm_streamk_instance_builder.py b/tile_engine/ops/gemm_streamk/gemm_streamk_instance_builder.py index c8d6f86ccc..4f3992bf99 100644 --- a/tile_engine/ops/gemm_streamk/gemm_streamk_instance_builder.py +++ b/tile_engine/ops/gemm_streamk/gemm_streamk_instance_builder.py @@ -376,8 +376,8 @@ class GemmKernelBuilder: reduction_strategy_map = { "atomic": "ck_tile::StreamKReductionStrategy::Atomic", - "reduction": "ck_tile::StreamKReductionStrategy::Reduction", - "tree": "ck_tile::StreamKReductionStrategy::TreeReduction", + "linear": "ck_tile::StreamKReductionStrategy::Linear", + "tree": "ck_tile::StreamKReductionStrategy::Tree", } # Determine accumulator type based on datatype @@ -449,7 +449,7 @@ struct SelectedKernel {{ static constexpr bool UsePersistentKernel = {"true" if str(persistent).lower() == "true" else "false"}; static constexpr bool UseStructuredSparsity = false; static constexpr ck_tile::index_t NumWaveGroups = 1; - static constexpr ck_tile::StreamKReductionStrategy reduction_strategy = {reduction_strategy_map.get(reduction_strategy, "ck_tile::StreamKReductionStrategy::Reduction")}; + static constexpr ck_tile::StreamKReductionStrategy reduction_strategy = {reduction_strategy_map.get(reduction_strategy, "ck_tile::StreamKReductionStrategy::Linear")}; // Tile shape using TileShape = ck_tile::TileGemmShape< @@ -552,12 +552,12 @@ struct SelectedKernel {{ hipGetErrorString(hipMemsetAsync( args.e_ptr, 0, args.M * args.N * sizeof(CDataType), stream.stream_id_)); }} - else if(reduction_strategy == ck_tile::StreamKReductionStrategy::Reduction) + else if(reduction_strategy == ck_tile::StreamKReductionStrategy::Linear) {{ // Reset sk flags to zero before each repetition of the kernel workspace_data.SetZero(); }} - else if(reduction_strategy == ck_tile::StreamKReductionStrategy::TreeReduction) + else if(reduction_strategy == ck_tile::StreamKReductionStrategy::Tree) {{ // Reset sk flags to zero before each repetition of the kernel workspace_data.SetZero(); diff --git a/tile_engine/ops/gemm_streamk/gemm_streamk_profiler.hpp b/tile_engine/ops/gemm_streamk/gemm_streamk_profiler.hpp index 2a7b07c698..577524b381 100644 --- a/tile_engine/ops/gemm_streamk/gemm_streamk_profiler.hpp +++ b/tile_engine/ops/gemm_streamk/gemm_streamk_profiler.hpp @@ -169,9 +169,9 @@ class GemmProfiler auto reduction_strategy = SelectedKernel::reduction_strategy == ck_tile::StreamKReductionStrategy::Atomic ? "Atomic" - : SelectedKernel::reduction_strategy == ck_tile::StreamKReductionStrategy::Reduction - ? "Reduction" - : "TreeReduction"; + : SelectedKernel::reduction_strategy == ck_tile::StreamKReductionStrategy::Linear + ? "Linear" + : "Tree"; KernelInstance kernel_instance{ name, dp_persistent, reduction_strategy, gemm_problem, {-1.0f, -1.0f, -1.0f}};