diff --git a/example/ck_tile/40_streamk_gemm/CMakeLists.txt b/example/ck_tile/40_streamk_gemm/CMakeLists.txt
new file mode 100644
index 0000000000..3539dee05b
--- /dev/null
+++ b/example/ck_tile/40_streamk_gemm/CMakeLists.txt
@@ -0,0 +1,5 @@
+if(GPU_TARGETS MATCHES "gfx9")
+    add_executable(tile_example_streamk_gemm_basic EXCLUDE_FROM_ALL streamk_gemm_basic.cpp)
+else()
+    message(DEBUG "Skipping ck_tile streamk gemm tests for current target")
+endif()
diff --git a/example/ck_tile/40_streamk_gemm/gemm_utils.hpp b/example/ck_tile/40_streamk_gemm/gemm_utils.hpp
new file mode 100644
index 0000000000..60c92bc356
--- /dev/null
+++ b/example/ck_tile/40_streamk_gemm/gemm_utils.hpp
@@ -0,0 +1,132 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+
+#define CK_TILE_PIPELINE_MEMORY 1
+
+struct GemmConfigBase
+{
+    static constexpr bool kPadM = true;
+    static constexpr bool kPadN = true;
+    static constexpr bool kPadK = true;
+
+    static constexpr bool PermuteA = false;
+    static constexpr bool PermuteB = false;
+
+    static constexpr bool TransposeC = false;
+    static constexpr bool UseStructuredSparsity = false;
+    static constexpr bool Persistent = false;
+
+    static constexpr int kBlockPerCu = 1;
+    static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_MEMORY;
+    static constexpr ck_tile::index_t NumWaveGroups = 1;
+    static constexpr bool Preshuffle = false;
+    static constexpr bool DoubleSmemBuffer = false;
+};
+
+template <typename PrecType>
+struct GemmConfigMemoryInterwave : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 32;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 8 : 16;
+
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_MEMORY;
+    static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave;
+};
+
+template <ck_tile::index_t PipelineId>
+struct PipelineTypeTraits;
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_MEMORY>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
+};
+
+template <typename ADataType_, typename BDataType_ = ADataType_, typename CDataType_ = ADataType_>
+struct StreamKGemmTypeConfig
+{
+    using ADataType = ADataType_;
+    using BDataType = BDataType_;
+    using AccDataType = float;
+    using CDataType = CDataType_;
+};
+
+template <typename DataType>
+struct DataTypeTraits;
+
+template <>
+struct DataTypeTraits<float>
+{
+    static constexpr const char* name = "fp32";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr const char* name = "fp16";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::bf16_t>
+{
+    static constexpr const char* name = "bf16";
+};
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "512", "m dimension")
+        .insert("n", "512", "n dimension")
+        .insert("k", "512", "k dimension")
+        .insert("a_layout", "R", "A tensor data layout - Row by default")
+        .insert("b_layout", "C", "B tensor data layout - Column by default")
+        .insert("c_layout", "R", "C tensor data layout - Row by default")
+        .insert("num_sk_blocks",
+                "-1",
+                "number of Stream-K blocks. -1: chosen by algorithm, or user selected")
+        .insert("reduction_strategy",
+                "atomic",
+                "strategy for storing results in C tensor - atomic/reduction")
+        .insert("occupancy",
+                "-1",
+                "maximum number of workgroups per CU - value of -1 queries occupancy from the device")
+        .insert("num_cu",
+                "-1",
+                "number of compute units (CUs) - value of -1 uses number of CUs on the device")
+        .insert("stride_a", "0", "Tensor A stride")
+        .insert("stride_b", "0", "Tensor B stride")
+        .insert("stride_c", "0", "Tensor C stride")
+        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
+        .insert("prec", "fp16", "data type. fp16/bf16")
+        .insert("warmup", "50", "number of iterations before benchmarking the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
+        .insert("flush_cache", "true", "flush cache before running the kernel, defaults to true");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
diff --git a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc
new file mode 100644
index 0000000000..b7204f2559
--- /dev/null
+++ b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc
@@ -0,0 +1,377 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT +#pragma once + +// Estimate the number of WGs contributing to the same macro tile in C +template +int estimate_num_wgs_per_tile(const TilePartitioner& tile_partitioner) +{ + // In the case of non-atomic reduction or DP only, there will always be 1 WG contributing to a + // macro time in C + int num_wgs_per_tile = 1; + + // Otherwise, for atomics, multiple WGs may be contributing to the same macro tile in C + if(tile_partitioner.sk_num_blocks > 0 && + ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic) + { + // Determine the number of iterations per WG for a given macro tile in C + uint32_t k_iters_per_block = tile_partitioner.k_iters_per_big_block - 1; + + // Estimate the number of WGs per macro tile + num_wgs_per_tile = (tile_partitioner.k_iters_per_tile.get() / (k_iters_per_block)) + + ((tile_partitioner.k_iters_per_tile.get() % k_iters_per_block) != 0); + } + + return std::max(num_wgs_per_tile, 1); +} + +template +static constexpr inline auto is_row_major(Layout layout_) +{ + return ck_tile::bool_constant, + ck_tile::tensor_layout::gemm::RowMajor>>{}; +} + +template +auto calculate_rtol_atol(const ck_tile::index_t K, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeType = + std::conditional_t; + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(K, kbatch)); + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); + // Calculate error due to multiple WGs working in the same C macro tile + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + const auto atol_split_k = ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} + +template +std::tuple gemm(const ck_tile::StreamKHostArgs& args, + const ck_tile::stream_config& s, + int num_cu, + int occupancy); + +template +std::tuple invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, + ck_tile::DeviceMem& b_k_n_dev_buf, + ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t stride_A, + ck_tile::index_t stride_B, + ck_tile::index_t stride_C, + int n_warmup, + int n_repeat, + bool flush_cache, + ck_tile::StreamKReductionStrategy reduction_strategy, + uint32_t num_sk_blocks, + int num_cu, + int occupancy) +{ + ck_tile::StreamKHostArgs args{a_m_k_dev_buf.GetDeviceBuffer(), + b_k_n_dev_buf.GetDeviceBuffer(), + c_m_n_dev_buf.GetDeviceBuffer(), + M, + N, + K, + stride_A, + stride_B, + stride_C, + reduction_strategy, + num_sk_blocks}; + + std::tuple ave_time_and_batch; + + if(args.reduction_strategy == ck_tile::StreamKReductionStrategy::Atomic) + { + ave_time_and_batch = gemm( + args, + ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, flush_cache}, + num_cu, + occupancy); + } + else /*Reduction*/ + { + ave_time_and_batch = gemm( + args, + ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, flush_cache}, + num_cu, + occupancy); + } + + return ave_time_and_batch; +} + +template +bool do_verify(const ck_tile::HostTensor& c_m_n_dev_result, + const ck_tile::HostTensor& c_m_n_ref, + const ck_tile::tuple& rtol_atol, + const char* variant) +{ + bool pass = ck_tile::check_err(c_m_n_dev_result, + c_m_n_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); 
+ + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) << std::endl; + std::cout << "The " << variant << " verification result is:" << (pass ? "correct" : "fail") + << std::endl; + return pass; +} + +ck_tile::StreamKReductionStrategy get_reduction_strategy_value(const std::string& strategy) +{ + if(strategy == "atomic") + { + return ck_tile::StreamKReductionStrategy::Atomic; + } + else if(strategy == "reduction") + { + return ck_tile::StreamKReductionStrategy::Reduction; + } + else + { + throw std::runtime_error("Unsupported Stream-K reduction strategy !!!"); + } +} + +void validate_num_cu_and_occupancy(int num_cu, int occupancy) +{ + if((num_cu == -1) != (occupancy == -1)) + { + throw std::runtime_error("Arguments num_cu and occupancy must both use either (a) " + "default values (-1) or (b) non-default values."); + } +} + +template +int run_gemm_example_with_layouts(int argc, + char* argv[], + const ALayout a_layout = ALayout{}, + const BLayout b_layout = BLayout{}, + [[maybe_unused]] const CLayout c_layout = CLayout{}) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + static_assert(!GemmConfig::Preshuffle, "Not implemented"); + static_assert(!GemmConfig::UseStructuredSparsity, "Not implemented"); + static_assert(!GemmConfig::PermuteA, "Not implemented"); + static_assert(!GemmConfig::PermuteB, "Not implemented"); + + using ADataType = typename TypeConfig::ADataType; + using BDataType = typename TypeConfig::BDataType; + using AccDataType = typename TypeConfig::AccDataType; + using CDataType = typename TypeConfig::CDataType; + + ck_tile::index_t M = arg_parser.get_int("m"); + ck_tile::index_t N = arg_parser.get_int("n"); + ck_tile::index_t K = arg_parser.get_int("k"); + + ck_tile::index_t stride_A = arg_parser.get_int("stride_a"); + ck_tile::index_t stride_B = arg_parser.get_int("stride_b"); + ck_tile::index_t stride_C = arg_parser.get_int("stride_c"); + + int n_warmup = arg_parser.get_int("warmup"); + int n_repeat = arg_parser.get_int("repeat"); + ck_tile::index_t init_method = arg_parser.get_int("init"); + bool flush_cache = arg_parser.get_bool("flush_cache"); + + ck_tile::StreamKReductionStrategy reduction_strategy = + get_reduction_strategy_value(arg_parser.get_str("reduction_strategy")); + uint32_t num_sk_blocks = static_cast(arg_parser.get_int("num_sk_blocks")); + int num_cu = arg_parser.get_int("num_cu"); + int occupancy = arg_parser.get_int("occupancy"); + + validate_num_cu_and_occupancy(num_cu, occupancy); + + stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout)); + stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout)); + stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{})); + + ck_tile::HostTensor a_m_k( + ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout))); + ck_tile::HostTensor b_k_n( + ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout))); + ck_tile::HostTensor c_m_n_dev_result( + ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); + + if(init_method == 0) + { + ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); + ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); + } + else if(init_method == 1) + { + ck_tile::FillMonotonicSeq{}(a_m_k); + ck_tile::FillMonotonicSeq{}(b_k_n); + } + else if(init_method == 2) + { + ck_tile::FillUniformDistribution{1.f, 1.f}(a_m_k); + ck_tile::FillUniformDistribution{1.f, 
1.f}(b_k_n); + } + else + { + a_m_k.SetZero(); + b_k_n.SetZero(); + } + + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + b_k_n_dev_buf.ToDevice(b_k_n.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + auto [ave_time, num_wgs_per_tile] = invoke_gemm, + AccDataType, + CDataType, + ALayout, + BLayout, + ck_tile::tuple<>, + CLayout>(a_m_k_dev_buf, + b_k_n_dev_buf, + c_m_n_dev_buf, + M, + N, + K, + stride_A, + stride_B, + stride_C, + n_warmup, + n_repeat, + flush_cache, + reduction_strategy, + num_sk_blocks, + num_cu, + occupancy); + + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_byte = + sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N; + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Run Gemm kernel with M=" << M << " N=" << N << " K=" << K + << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C + << " A_Layout=" << ALayout::name << " B_Layout=" << BLayout::name + << " C_Layout=" << CLayout::name << " A_Type=" << DataTypeTraits::name + << " B_Type=" << DataTypeTraits::name + << " C_Type=" << DataTypeTraits::name + << " reduction_strategy=" << arg_parser.get_str("reduction_strategy") << " " + << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << std::endl; + + bool pass = true; + + // Memory on host to store gpu reference result + ck_tile::HostTensor c_m_n_ref( + ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); + c_m_n_ref.SetZero(); + + if(arg_parser.get_int("v") == 1) // Validate on the CPU + { + ck_tile::reference_gemm( + a_m_k, b_k_n, c_m_n_ref); + const float max_accumulated_value = + *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end()); + const auto rtol_atol = calculate_rtol_atol( + K, num_wgs_per_tile, max_accumulated_value); + pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "CPU"); + } + else if(arg_parser.get_int("v") == 2) // Validate on the GPU + { + // Memory on device to store gpu reference result + ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_ref.get_element_space_size_in_bytes()); + c_m_n_gpu_buf_ref.SetZero(); + + ADataType* d_A = static_cast(a_m_k_dev_buf.GetDeviceBuffer()); + BDataType* d_B = static_cast(b_k_n_dev_buf.GetDeviceBuffer()); + CDataType* d_C = static_cast(c_m_n_gpu_buf_ref.GetDeviceBuffer()); + + ck_tile::reference_gemm_gpu(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C); + + c_m_n_gpu_buf_ref.FromDevice(c_m_n_ref.data()); + + const float max_accumulated_value = + *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end()); + const auto rtol_atol = calculate_rtol_atol( + K, num_wgs_per_tile, max_accumulated_value); + pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "GPU"); + } + + return pass; +} diff --git a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp new file mode 100644 index 0000000000..5b0d3464b7 --- /dev/null +++ b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp @@ -0,0 +1,202 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#include "gemm_utils.hpp" +#include "run_gemm_example.inc" + +template +std::tuple gemm(const ck_tile::StreamKHostArgs& args, + const ck_tile::stream_config& s, + int num_cu, + int occupancy) + +{ + using GemmShape = ck_tile::TileGemmShape< + ck_tile::sequence, + ck_tile::sequence, + ck_tile:: + sequence, + GemmConfig::PermuteA, + GemmConfig::PermuteB>; + + using TilePartitioner = ck_tile::StreamKTilePartitioner; + + using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits; + + const auto Run = [&](const auto memory_operation_) -> std::tuple { + constexpr auto memory_operation = memory_operation_.value; + constexpr auto scheduler = GemmConfig::Scheduler; + + // We create the GEMM pipeline without specifying has_hot_loop or tail_num. + // This is because num_loop can vary (a) per WG and (b) per iteration of the Stream-K + // while loop. Instead, has_hot_loop and tail_num are determined in the Stream-K + // Kernel's RunGemm function. This is a similar pattern used by grouped GEMM. + using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; + + using GemmPipeline = typename PipelineTypeTraits< + GemmConfig::Pipeline>::template GemmPipeline; + + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; + + using Kernel = ck_tile::StreamKKernel; + + auto kargs = (num_cu == -1 && occupancy == -1) + ? Kernel::MakeKernelArgs(args) + : Kernel::MakeKernelArgs(args, num_cu, occupancy); + + dim3 grids = Kernel::GridSize(kargs.tile_partitioner); + dim3 blocks = Kernel::BlockSize(); + + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); + } + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' + << "shape: " << GemmShape::GetName() << '\n' + << "problem: " << UniversalGemmProblem::GetName() << '\n' + << "pipeline: " << GemmPipeline::GetName() << '\n' + << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + // Function to clear the output C tensor results after each repetition of the kernel + auto clear_gemm_output = [&]() { + if(ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic) + hipGetErrorString(hipMemsetAsync( + args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_)); + }; + + std::function preprocess = clear_gemm_output; + + float ave_time = ck_tile::launch_kernel_time_mask( + s, + preprocess, + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + int num_wgs_per_tile = estimate_num_wgs_per_tile(kargs.tile_partitioner); + + return std::tuple{ave_time, num_wgs_per_tile}; + }; + + if constexpr(ck_tile::StreamKReductionStrategy::Atomic == ReductionStrategy) + { + return Run(ck_tile::integral_constant{}); + } + else // We are using ck_tile::StreamKReductionStrategy::Reduction + { + return Run(ck_tile::integral_constant{}); + } +} + +template +int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[]) +{ + using Row = ck_tile::tensor_layout::gemm::RowMajor; + using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + + if(a_layout == "R" && b_layout == "C") + { + return run_gemm_example_with_layouts( + argc, argv, Row{}, Col{}, Row{}); + } + else + { + throw std::runtime_error("Unsupported layouts."); + } + + return 0; +} + +template