From ed385de9b26a1e6b99c3508cdbba747b60da5ddd Mon Sep 17 00:00:00 2001
From: aledudek
Date: Fri, 29 Nov 2024 11:52:18 +0100
Subject: [PATCH] Ck tile batched gemm example (#1615)

* [CK Tile] Batched GEMM Example

* [CK Tile] Batched GEMM Example - minor refactor

* [CK Tile] Batched GEMM Example - README update

* [CK Tile] Batched Gemm Example - review changes
  - Added tensor data layouts as input parameters
  - Changed structure of Host and Kernel args
  - Removed a bug with an invalid vector read on non-contiguous memory

* [CK Tile] Batched Gemm Example - remove comment

* [CK Tile] Batched Gemm Example - Add GTests part1

* [CK Tile] Batched Gemm Example - GTests part2 + review changes

* [CK TILE] Batched GEMM post merge fixes

* [CK Tile] Batched GEMM Example - fix pad views

[ROCm/composable_kernel commit: 78f0fea08eafa7e3da49cbb3d77c962cecb3ae0b]
---
 .../ck_tile/16_batched_gemm/CMakeLists.txt    |   1 +
 example/ck_tile/16_batched_gemm/README.md     |  37 +++
 .../ck_tile/16_batched_gemm/batched_gemm.cpp  | 103 +++++++
 .../ck_tile/16_batched_gemm/batched_gemm.hpp  |  63 +++++
 .../run_batched_gemm_example.inc              | 253 +++++++++++++++++
 example/ck_tile/CMakeLists.txt                |   2 +-
 .../ck_tile/host/reference/reference_gemm.hpp | 112 ++++++++
 include/ck_tile/ops/gemm.hpp                  |   1 +
 .../ops/gemm/kernel/batched_gemm_kernel.hpp   | 258 ++++++++++++++++++
 .../gemm_pipeline_agmem_bgmem_creg_v1.hpp     |   2 +-
 test/ck_tile/CMakeLists.txt                   |   1 +
 test/ck_tile/batched_gemm/CMakeLists.txt      |   4 +
 .../batched_gemm/test_batched_gemm.cpp        |  29 ++
 .../test_batched_gemm_ut_cases.inc            |   9 +
 .../batched_gemm/test_batched_gemm_util.hpp   | 225 +++++++++++++++
 15 files changed, 1098 insertions(+), 2 deletions(-)
 create mode 100644 example/ck_tile/16_batched_gemm/CMakeLists.txt
 create mode 100644 example/ck_tile/16_batched_gemm/README.md
 create mode 100644 example/ck_tile/16_batched_gemm/batched_gemm.cpp
 create mode 100644 example/ck_tile/16_batched_gemm/batched_gemm.hpp
 create mode 100644 example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
 create mode 100644 include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
 create mode 100644 test/ck_tile/batched_gemm/CMakeLists.txt
 create mode 100644 test/ck_tile/batched_gemm/test_batched_gemm.cpp
 create mode 100644 test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc
 create mode 100644 test/ck_tile/batched_gemm/test_batched_gemm_util.hpp

diff --git a/example/ck_tile/16_batched_gemm/CMakeLists.txt b/example/ck_tile/16_batched_gemm/CMakeLists.txt
new file mode 100644
index 0000000000..78e78c6b04
--- /dev/null
+++ b/example/ck_tile/16_batched_gemm/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(tile_example_batched_gemm EXCLUDE_FROM_ALL batched_gemm.cpp)
diff --git a/example/ck_tile/16_batched_gemm/README.md b/example/ck_tile/16_batched_gemm/README.md
new file mode 100644
index 0000000000..34b56db526
--- /dev/null
+++ b/example/ck_tile/16_batched_gemm/README.md
@@ -0,0 +1,37 @@
+# Batched GEMM
+
+This folder contains an example of batched GEMM using the ck_tile tile-programming implementation.
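+
+For each batch index `g` the example computes an independent GEMM on the `g`-th slices of the
+tensors: `C[g] = A[g] * B[g]`. The sketch below shows only the addressing convention the strides
+imply (a naive host-side illustration, not the ck_tile kernel, which tiles, vectorizes and
+parallelizes this):
+
+```cpp
+// Row-major case: element (g, i, j) of A sits at g * batch_stride_a + i * stride_a + j.
+for(int g = 0; g < batch_count; ++g)
+    for(int i = 0; i < m; ++i)
+        for(int j = 0; j < n; ++j)
+        {
+            float acc = 0.f;
+            for(int l = 0; l < k; ++l)
+                acc += a[g * batch_stride_a + i * stride_a + l] *
+                       b[g * batch_stride_b + l * stride_b + j];
+            c[g * batch_stride_c + i * stride_c + j] = acc;
+        }
+```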
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+sh ../script/cmake-ck-dev.sh ../ <arch>
+make tile_example_batched_gemm -j
+```
+This will result in an executable `build/bin/tile_example_batched_gemm`.
+
+## example
+```
+args:
+          -m                m dimension (default:256)
+          -n                n dimension (default:128)
+          -k                k dimension (default:128)
+          -a_layout         A tensor data layout (default:R) (R for Row, C for Col)
+          -b_layout         B tensor data layout (default:R) (R for Row, C for Col)
+          -c_layout         C tensor data layout (default:R) (R for Row, C for Col)
+          -stride_a         Tensor A stride (default:0, i.e. packed according to the layout)
+          -stride_b         Tensor B stride (default:0, i.e. packed according to the layout)
+          -stride_c         Tensor C stride (default:0, i.e. packed according to the layout)
+          -batch_stride_a   Batch A stride (default:32768)
+          -batch_stride_b   Batch B stride (default:16384)
+          -batch_stride_c   Batch C stride (default:32768)
+          -batch_count      Batch count (default:16)
+          -v                0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
+          -prec             data type. fp16/bf16/fp8/bf8 (default:fp16)
+          -warmup           number of iterations before benchmarking the kernel (default:50)
+          -repeat           number of iterations to benchmark the kernel (default:100)
+          -timer            gpu:gpu timer, cpu:cpu timer (default:gpu)
+```
\ No newline at end of file
diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
new file mode 100644
index 0000000000..bfdd74126e
--- /dev/null
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <tuple>
+#include <type_traits>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/host.hpp"
+#include "batched_gemm.hpp"
+
+template <typename ALayout, typename BLayout, typename CLayout>
+float batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& s)
+{
+    // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
+    constexpr bool kPadM = false;
+    constexpr bool kPadN = false;
+    constexpr bool kPadK = false;
+    constexpr bool kTilePermute = false;
+    // The rank and permutation will also be generated by the CodeGen part.
+    constexpr ck_tile::index_t kOutputRank = 2;
+
+    constexpr int kBlockPerCu = 1;
+
+    // This part comes from the Codegen
+    constexpr ck_tile::index_t M_Tile = 128;
+    constexpr ck_tile::index_t N_Tile = 128;
+    constexpr ck_tile::index_t K_Tile = 32;
+
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+    // Whether to apply CShuffle (a transpose before the global-memory store) depends on the
+    // output layout.
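+    // For a column-major C the accumulator tile is shuffled (transposed) through LDS before the
+    // global store so that writes stay coalesced; a row-major C can use the plain 2D epilogue.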
+ constexpr bool CShuffleEpilogue = + std::is_same_v; + + using CodegenGemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = ck_tile::GemmTilePartitioner; + + using GemmEpilogue = std::conditional_t< + CShuffleEpilogue, + ck_tile::CShuffleEpilogue>, + ck_tile::Default2DEpilogue< + ck_tile::Default2DEpilogueProblem>>; + + using CodegenGemmTraits = + ck_tile::TileGemmTraits; + + using CodegenPipelineProblem = ck_tile:: + GemmPipelineProblem; + + using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + // ToDo: Will add the codegen part to test different pipeline policies in GEMM. + // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. + using Kernel = ck_tile::BatchedGemmKernel; + + auto kargs = Kernel::MakeKargs(args); + + const dim3 grids = Kernel::GridSize(args); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; +} + +#include "run_batched_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); } diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.hpp b/example/ck_tile/16_batched_gemm/batched_gemm.hpp new file mode 100644 index 0000000000..e252c0f673 --- /dev/null +++ b/example/ck_tile/16_batched_gemm/batched_gemm.hpp @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" + +template +struct BatchedGemmTypeConfig; + +template <> +struct BatchedGemmTypeConfig +{ + using ADataType = ck_tile::half_t; + using BDataType = ck_tile::half_t; + using AccDataType = float; + using CDataType = ck_tile::half_t; +}; + +using Types = BatchedGemmTypeConfig; + +// Specific type aliases for easy access +using ADataType = Types::ADataType; +using BDataType = Types::BDataType; +using AccDataType = Types::AccDataType; +using CDataType = Types::CDataType; + +struct batched_gemm_kargs : public ck_tile::BatchedGemmHostArgs +{ +}; + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "256", "m dimension") + .insert("n", "128", "n dimension") + .insert("k", "128", "k dimension") + .insert("stride_a", "0", "Tensor A stride") + .insert("stride_b", "0", "Tensor B stride") + .insert("stride_c", "0", "Tensor C stride") + .insert("a_layout", "R", "A tensor data layout - Row by default") + .insert("b_layout", "R", "B tensor data layout - Row by default") + .insert("c_layout", "R", "C tensor data layout - Row by default") + .insert("batch_stride_a", "32768", "Batch A stride") + .insert("batch_stride_b", "16384", "Batch B stride") + .insert("batch_stride_c", "32768", "Batch C stride") + .insert("batch_count", "16", "Batch count") + .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU") + .insert("prec", "fp16", "data type. 
fp16/bf16/fp8/bf8")
+        .insert("warmup", "50", "number of iterations before benchmarking the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// host API
+float batched_gemm(batched_gemm_kargs args, const ck_tile::stream_config& s);
diff --git a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
new file mode 100644
index 0000000000..dacca2042e
--- /dev/null
+++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+template <typename ALayout, typename BLayout, typename CLayout>
+float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
+                          ck_tile::DeviceMem& b_k_n_dev_buf,
+                          ck_tile::DeviceMem& c_m_n_dev_buf,
+                          ck_tile::index_t M,
+                          ck_tile::index_t N,
+                          ck_tile::index_t K,
+                          ck_tile::index_t stride_A,
+                          ck_tile::index_t stride_B,
+                          ck_tile::index_t stride_C,
+                          ck_tile::index_t batch_stride_A,
+                          ck_tile::index_t batch_stride_B,
+                          ck_tile::index_t batch_stride_C,
+                          ck_tile::index_t batch_count,
+                          int n_warmup,
+                          int n_repeat)
+{
+    batched_gemm_kargs args;
+    args.a_ptr          = a_m_k_dev_buf.GetDeviceBuffer();
+    args.b_ptr          = b_k_n_dev_buf.GetDeviceBuffer();
+    args.c_ptr          = c_m_n_dev_buf.GetDeviceBuffer();
+    args.M              = M;
+    args.N              = N;
+    args.K              = K;
+    args.stride_A       = stride_A;
+    args.stride_B       = stride_B;
+    args.stride_C       = stride_C;
+    args.batch_stride_A = batch_stride_A;
+    args.batch_stride_B = batch_stride_B;
+    args.batch_stride_C = batch_stride_C;
+    args.batch_count    = batch_count;
+
+    float ave_time = batched_gemm<ALayout, BLayout, CLayout>(
+        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+
+    std::string op_name{"Batched Gemm"};
+    std::size_t flop = std::size_t(2) * batch_count * M * N * K;
+    std::size_t num_byte = sizeof(ADataType) * batch_count * M * K +
+                           sizeof(BDataType) * batch_count * N * K +
+                           sizeof(CDataType) * batch_count * M * N;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run " << op_name << " kernel with M =" << M << " N =" << N << " K =" << K
+              << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C
+              << " batch_stride_A =" << batch_stride_A << " batch_stride_B =" << batch_stride_B
+              << " batch_stride_C =" << batch_stride_C << " batch_count =" << batch_count << " : "
+              << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    return ave_time;
+}
+
+template <typename ALayout, typename BLayout, typename CLayout>
+int run_batched_gemm_example_with_layouts(int argc,
+                                          char* argv[],
+                                          const ALayout a_layout = ALayout{},
+                                          const BLayout b_layout = BLayout{},
+                                          [[maybe_unused]] const CLayout c_layout = CLayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+
+    ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
+    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
+    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
+
+    ck_tile::index_t batch_stride_A = arg_parser.get_int("batch_stride_a");
+    ck_tile::index_t batch_stride_B = arg_parser.get_int("batch_stride_b");
+    ck_tile::index_t batch_stride_C = arg_parser.get_int("batch_stride_c");
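+    // Batch strides are element counts between consecutive matrices of a tensor, not byte offsets.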
+    ck_tile::index_t batch_count    = arg_parser.get_int("batch_count");
+
+    int n_warmup = arg_parser.get_int("warmup");
+    int n_repeat = arg_parser.get_int("repeat");
+
+    using namespace ck_tile::literals;
+
+    auto f_host_tensor_descriptor = [](std::size_t batch_count_,
+                                       std::size_t row,
+                                       std::size_t col,
+                                       std::size_t stride,
+                                       std::size_t batch_stride,
+                                       auto layout) {
+        if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+        {
+            return ck_tile::HostTensorDescriptor({batch_count_, row, col},
+                                                 {batch_stride, stride, 1_uz});
+        }
+        else
+        {
+            return ck_tile::HostTensorDescriptor({batch_count_, row, col},
+                                                 {batch_stride, 1_uz, stride});
+        }
+    };
+
+    auto f_get_default_stride = [](std::size_t row,
+                                   std::size_t col,
+                                   std::size_t stride,
+                                   auto layout) {
+        if(stride == 0)
+        {
+            // if stride is zero, return the default packed stride for this layout
+            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                return col;
+            }
+            else
+            {
+                return row;
+            }
+        }
+        else
+            return stride;
+    };
+
+    stride_A = f_get_default_stride(M, K, stride_A, a_layout);
+    stride_B = f_get_default_stride(K, N, stride_B, b_layout);
+    stride_C = f_get_default_stride(M, N, stride_C, c_layout);
+
+    ck_tile::HostTensor<ADataType> a_m_k(
+        f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, a_layout));
+    ck_tile::HostTensor<BDataType> b_k_n(
+        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, b_layout));
+    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+        f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, c_layout));
+
+    ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
+    ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+
+    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+    a_m_k_dev_buf.ToDevice(a_m_k.data());
+    b_k_n_dev_buf.ToDevice(b_k_n.data());
+    c_m_n_dev_buf.SetZero();
+    c_m_n_dev_result.SetZero();
+
+    invoke_batched_gemm<ALayout, BLayout, CLayout>(a_m_k_dev_buf,
+                                                   b_k_n_dev_buf,
+                                                   c_m_n_dev_buf,
+                                                   M,
+                                                   N,
+                                                   K,
+                                                   stride_A,
+                                                   stride_B,
+                                                   stride_C,
+                                                   batch_stride_A,
+                                                   batch_stride_B,
+                                                   batch_stride_C,
+                                                   batch_count,
+                                                   n_warmup,
+                                                   n_repeat);
+
+    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+    bool pass = true;
+
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, CLayout{}));
+        c_m_n_host_ref.SetZero();
+
+        const auto b_n_k = b_k_n.transpose({0, 2, 1});
+
+        ck_tile::reference_batched_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_m_k, b_n_k, c_m_n_host_ref);
+
+        pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref);
+
+        std::cout << "The CPU verification result is: " << (pass ? "correct" : "fail")
+                  << std::endl;
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
+            f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, CLayout{}));
+        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
+        c_m_n_gpu_ref.SetZero();
+        c_m_n_gpu_buf_ref.SetZero();
+
+        ck_tile::reference_batched_gemm_gpu<ADataType,
+                                            BDataType,
+                                            AccDataType,
+                                            CDataType,
+                                            ALayout,
+                                            BLayout,
+                                            CLayout>(a_m_k_dev_buf,
+                                                     b_k_n_dev_buf,
+                                                     c_m_n_gpu_buf_ref,
+                                                     M,
+                                                     N,
+                                                     K,
+                                                     stride_A,
+                                                     stride_B,
+                                                     stride_C,
+                                                     batch_stride_A,
+                                                     batch_stride_B,
+                                                     batch_stride_C,
+                                                     batch_count);
+
+        c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
+        pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_gpu_ref);
+
+        std::cout << "The GPU verification result is: " << (pass ? "correct" : "fail")
+                  << std::endl;
"correct" : "fail") << std::endl; + } + + return pass; +} + +int run_batched_gemm_example(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + using Row = ck_tile::tensor_layout::gemm::RowMajor; + using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + + std::string a_layout = arg_parser.get_str("a_layout"); + std::string b_layout = arg_parser.get_str("b_layout"); + + if(a_layout == "R" && b_layout == "R") + { + return run_batched_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{}); + } + else if(a_layout == "R" && b_layout == "C") + { + return run_batched_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); + } + // TODO: Fixme: with latest changes to GemmPipelineAGmemBGmemCRegV1DefaultPolicy below do not + // work else if(a_layout == "C" && b_layout == "C") + // { + // return run_batched_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{}); + // } + // else if(a_layout == "C" && b_layout == "R") + // { + // return run_batched_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{}); + // } + else + { + throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); + } +} diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 29305405bc..51ebb5bf07 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -15,4 +15,4 @@ add_subdirectory(12_smoothquant) add_subdirectory(13_moe_sorting) add_subdirectory(14_moe_smoothquant) add_subdirectory(15_fused_moe) - +add_subdirectory(16_batched_gemm) diff --git a/include/ck_tile/host/reference/reference_gemm.hpp b/include/ck_tile/host/reference/reference_gemm.hpp index dbdef0e9c7..8bd1f5b048 100644 --- a/include/ck_tile/host/reference/reference_gemm.hpp +++ b/include/ck_tile/host/reference/reference_gemm.hpp @@ -183,4 +183,116 @@ void reference_gemm_gpu(DeviceMem& a_device, return; } + +template +void reference_batched_gemm_gpu(DeviceMem& a_device, + DeviceMem& b_device, + DeviceMem& c_device, + index_t M, + index_t N, + index_t K, + index_t stride_a, + index_t stride_b, + index_t stride_c, + index_t batch_stride_A, + index_t batch_stride_B, + index_t batch_stride_C, + index_t batch_count) +{ + + ADataType* d_A; + BDataType* d_B; + CDataType* d_C; + + hipError_t errA = hipMalloc(&d_A, batch_count * M * K * sizeof(ADataType)); + hipError_t errB = hipMalloc(&d_B, batch_count * N * K * sizeof(BDataType)); + hipError_t errC = hipMalloc(&d_C, batch_count * M * N * sizeof(CDataType)); + if(errA != hipSuccess) + { + std::cerr << "Error allocating device memory for A: " << hipGetErrorString(errA) + << std::endl; + return; // Early exit on error + } + + if(errB != hipSuccess) + { + std::cerr << "Error allocating device memory for B: " << hipGetErrorString(errB) + << std::endl; + return; // Early exit on error + } + + if(errC != hipSuccess) + { + std::cerr << "Error allocating device memory for C: " << hipGetErrorString(errC) + << std::endl; + return; // Early exit on error + } + + errA = hipMemcpy(d_A, + a_device.GetDeviceBuffer(), + batch_count * M * K * sizeof(ADataType), + hipMemcpyHostToDevice); + if(errA != hipSuccess) + { + std::cerr << "Error copying A to device: " << hipGetErrorString(errA) << std::endl; + } + + errB = hipMemcpy(d_B, + b_device.GetDeviceBuffer(), + batch_count * N * K * sizeof(BDataType), + hipMemcpyHostToDevice); + if(errB != hipSuccess) + { + std::cerr << "Error copying B to device: " << hipGetErrorString(errB) << std::endl; + } + + int totalElements = M * N; + int 
+    int totalElements      = M * N;
+    int numThreadsPerBlock = 256; // Common choice for threads per block
+    int numBlocks          = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
+
+    for(index_t batch_id = 0; batch_id < batch_count; ++batch_id)
+    {
+        ADataType* d_ATemp = d_A + batch_id * batch_stride_A;
+        BDataType* d_BTemp = d_B + batch_id * batch_stride_B;
+        CDataType* d_CTemp = d_C + batch_id * batch_stride_C;
+        naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
+            <<<numBlocks, numThreadsPerBlock>>>(
+                d_ATemp, d_BTemp, d_CTemp, M, N, K, stride_a, stride_b, stride_c);
+    }
+
+    errC = hipMemcpy(c_device.GetDeviceBuffer(),
+                     d_C,
+                     batch_count * M * N * sizeof(CDataType),
+                     hipMemcpyDeviceToHost);
+    if(errC != hipSuccess)
+    {
+        std::cerr << "Error copying C back from device: " << hipGetErrorString(errC) << std::endl;
+    }
+
+    errA = hipFree(d_A);
+    if(errA != hipSuccess)
+    {
+        std::cerr << "Error freeing the A memory: " << hipGetErrorString(errA) << std::endl;
+    }
+
+    errB = hipFree(d_B);
+    if(errB != hipSuccess)
+    {
+        std::cerr << "Error freeing the B memory: " << hipGetErrorString(errB) << std::endl;
+    }
+
+    errC = hipFree(d_C);
+    if(errC != hipSuccess)
+    {
+        std::cerr << "Error freeing the C memory: " << hipGetErrorString(errC) << std::endl;
+    }
+
+    return;
+}
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index 1340fb2048..b9eb248581 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -25,6 +25,7 @@
 #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
+#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
new file mode 100644
index 0000000000..07b4af5730
--- /dev/null
+++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
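+//
+// Usage sketch (illustrative, not a drop-in -- here "Kernel" stands for a fully instantiated
+// BatchedGemmKernel, as assembled in example/ck_tile/16_batched_gemm/batched_gemm.cpp):
+//
+//   ck_tile::BatchedGemmHostArgs args{};
+//   args.a_ptr = a_dev;  args.b_ptr = b_dev;  args.c_ptr = c_dev;
+//   args.M = 256; args.N = 128; args.K = 128;
+//   args.stride_A = args.stride_B = args.stride_C = 128;
+//   args.batch_stride_A = 256 * 128; // elements between consecutive A matrices
+//   args.batch_stride_B = 128 * 128;
+//   args.batch_stride_C = 256 * 128;
+//   args.batch_count    = 16;
+//
+//   auto kargs            = Kernel::MakeKargs(args); // host args -> device kernel args
+//   const dim3 grids      = Kernel::GridSize(args);  // grid z-dimension walks the batch
+//   constexpr dim3 blocks = Kernel::BlockSize();
+//   ck_tile::launch_kernel(stream_cfg,
+//                          ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs));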
+ +#pragma once + +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" + +namespace ck_tile { + +struct BatchedGemmHostArgs +{ + const void* a_ptr; + const void* b_ptr; + void* c_ptr; + index_t M; + index_t N; + index_t K; + index_t stride_A; + index_t stride_B; + index_t stride_C; + index_t batch_stride_A; + index_t batch_stride_B; + index_t batch_stride_C; + index_t batch_count; +}; + +template +struct BatchedGemmKernel +{ + using TilePartitioner = remove_cvref_t; + using GemmPipeline = remove_cvref_t; + using EpiloguePipeline = remove_cvref_t; + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize; + + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + + struct BatchedGemmKargs + { + const void* a_ptr; + const void* b_ptr; + void* c_ptr; + index_t M; + index_t N; + index_t K; + index_t stride_A; + index_t stride_B; + index_t stride_C; + index_t batch_stride_A; + index_t batch_stride_B; + index_t batch_stride_C; + index_t batch_count; + }; + + using Kargs = BatchedGemmKargs; + using Hargs = BatchedGemmHostArgs; + + __host__ static constexpr auto GridSize(const Hargs& h) + { + return TilePartitioner::GridSize(h.M, h.N, h.batch_count); + } + + __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); } + + CK_TILE_HOST static constexpr BatchedGemmKargs MakeKargs(const Hargs& h) + { + Kargs k; + k.a_ptr = h.a_ptr; + k.b_ptr = h.b_ptr; + k.c_ptr = h.c_ptr; + k.M = h.M; + k.N = h.N; + k.K = h.K; + k.stride_A = h.stride_A; + k.stride_B = h.stride_B; + k.stride_C = h.stride_C; + k.batch_stride_A = h.batch_stride_A; + k.batch_stride_B = h.batch_stride_B; + k.batch_stride_C = h.batch_stride_C; + k.batch_count = h.batch_count; + return k; + } + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize()); + } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + const auto [i_m, i_n] = TilePartitioner{}(); + const auto i_batch = __builtin_amdgcn_readfirstlane(blockIdx.z); + + // options + const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A); + const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A); + const ADataType* a_start = static_cast(kargs.a_ptr); + + const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B); + const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B); + const BDataType* b_start = static_cast(kargs.b_ptr); + + // Convert pointers to tensor views + auto a_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + a_start + batch_offset_A, + make_tuple(kargs.M, kargs.K), + make_tuple(kargs.stride_A, 1), + number{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + a_start + batch_offset_A, + make_tuple(kargs.M, kargs.K), + make_tuple(1, kargs.stride_A), + number<1>{}, + number<1>{}); + } + }(); + + auto b_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + b_start + batch_offset_B, + make_tuple(kargs.N, kargs.K), + make_tuple(1, kargs.stride_B), + number<1>{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + b_start + batch_offset_B, + make_tuple(kargs.N, kargs.K), + make_tuple(kargs.stride_B, 1), + number{}, + number<1>{}); + } + }(); + + auto a_pad_view = [&]() { 
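+            // Pad only along the dimensions that are not contiguous in memory for this layout;
+            // keeping the contiguous axis unpadded preserves legal vectorized loads (this is
+            // what the "fix pad views" follow-up commit addressed).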
+ if constexpr(std::is_same_v) + { + return pad_tensor_view( + a_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + return pad_tensor_view( + a_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + }(); + // clang-format on + + auto a_block_window = make_tile_window( + a_pad_view, + make_tuple(number{}, number{}), + {i_m, 0}); + + auto b_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view( + b_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + return pad_tensor_view( + b_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + }(); + // clang-format on + + auto b_block_window = make_tile_window( + b_pad_view, + make_tuple(number{}, number{}), + {i_n, 0}); + + // allocate LDS + __shared__ char smem_ptr[GetSmemSize()]; + + const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K); + + // Run GEMM cooperatively by whole wokrgroup. + auto c_block_tile = + GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr); + + const auto batch_stride_C = __builtin_amdgcn_readfirstlane(kargs.batch_stride_C); + const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_C); + CDataType* c_start = static_cast(kargs.c_ptr); + auto c_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + c_start + batch_offset_C, + make_tuple(kargs.M, kargs.N), + make_tuple(kargs.stride_C, 1), + number{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + c_start + batch_offset_C, + make_tuple(kargs.M, kargs.N), + make_tuple(1, kargs.stride_C), + number<1>{}, + number<1>{}); + } + }(); + + auto c_pad_view = [&]() { + if constexpr(std::is_same_v) + { + return pad_tensor_view( + c_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + return pad_tensor_view( + c_tensor_view, + make_tuple(number{}, number{}), + sequence{}); + } + }(); + auto c_block_window = make_tile_window( + c_pad_view, + make_tuple(number{}, number{}), + {i_m, i_n}); + + EpiloguePipeline{}(c_block_window, c_block_tile); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp index c0817e736b..822748c69b 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp @@ -124,7 +124,7 @@ struct GemmPipelineAGmemBGmemCRegV1 b_lds_block, make_tuple(number{}, number{}), {0, 0}); // Block GEMM - constexpr auto block_gemm = Policy::template GetBlockGemm(); + auto block_gemm = Policy::template GetBlockGemm(); // Acc register tile auto c_block_tile = decltype(block_gemm(a_lds_gemm_window, b_lds_gemm_window)){}; diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index ac9c4311df..fd0de0f9c1 100644 --- a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(image_to_column) add_subdirectory(gemm) +add_subdirectory(batched_gemm) diff --git a/test/ck_tile/batched_gemm/CMakeLists.txt b/test/ck_tile/batched_gemm/CMakeLists.txt new file mode 100644 index 0000000000..532ead1124 --- /dev/null +++ b/test/ck_tile/batched_gemm/CMakeLists.txt @@ -0,0 +1,4 @@ +# Currently ck_tile is only built on gfx9 +if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_ck_tile_batched_gemm test_batched_gemm.cpp) +endif() diff --git 
a/test/ck_tile/batched_gemm/test_batched_gemm.cpp b/test/ck_tile/batched_gemm/test_batched_gemm.cpp new file mode 100644 index 0000000000..29bed8d2fd --- /dev/null +++ b/test/ck_tile/batched_gemm/test_batched_gemm.cpp @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "gtest/gtest.h" + +#include "ck_tile/host.hpp" +#include "test_batched_gemm_util.hpp" + +using F16 = ck_tile::half_t; +using F32 = float; + +using Row = ck_tile::tensor_layout::gemm::RowMajor; +using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + +// clang-format off +using KernelTypes = ::testing::Types< + // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType + std::tuple< Row, Row, Row, F16, F16, F32, F16>, + //std::tuple< Col, Row, Row, F16, F16, F32, F16>, + std::tuple< Row, Col, Row, F16, F16, F32, F16>//, + //std::tuple< Col, Col, Row, F16, F16, F32, F16> + >; +// clang-format on + +TYPED_TEST_SUITE(TestCkTileBatchedGemm, KernelTypes); + +#include "test_batched_gemm_ut_cases.inc" diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc b/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc new file mode 100644 index 0000000000..f261164d61 --- /dev/null +++ b/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc @@ -0,0 +1,9 @@ +#pragma once + +TYPED_TEST(TestCkTileBatchedGemm, Basic) +{ + constexpr int M = 256; + constexpr int N = 128; + constexpr int K = 128; + this->Run(M, N, K); +} diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp new file mode 100644 index 0000000000..88145b987b --- /dev/null +++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" + +template +class TestCkTileBatchedGemm : public ::testing::Test +{ + protected: + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using CLayout = std::tuple_element_t<2, Tuple>; + using ADataType = std::tuple_element_t<3, Tuple>; + using BDataType = std::tuple_element_t<4, Tuple>; + using AccDataType = std::tuple_element_t<5, Tuple>; + using CDataType = std::tuple_element_t<6, Tuple>; + + struct batched_gemm_kargs : public ck_tile::BatchedGemmHostArgs + { + }; + + template + void invoke_batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& s) + { + // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part. + constexpr bool kPadM = false; + constexpr bool kPadN = false; + constexpr bool kPadK = false; + constexpr bool kTilePermute = false; + // The rank and permutation will also be generate out by the CodeGen part. 
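+        // These constants deliberately mirror example/ck_tile/16_batched_gemm/batched_gemm.cpp,
+        // so the test exercises the same kernel configuration as the standalone example.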
+ constexpr ck_tile::index_t kOutputRank = 2; + + constexpr int kBlockPerCu = 1; + + // This part comes from the Codegen + constexpr ck_tile::index_t M_Tile = 128; + constexpr ck_tile::index_t N_Tile = 128; + constexpr ck_tile::index_t K_Tile = 32; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 8; + + // Whether doing the CShuffle (transpose before the global memory), depending on the output + // layout. + constexpr bool CShuffleEpilogue = + std::is_same_v; + + using CodegenGemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = ck_tile::GemmTilePartitioner; + + using GemmEpilogue = std::conditional_t< + CShuffleEpilogue, + ck_tile::CShuffleEpilogue>, + ck_tile::Default2DEpilogue< + ck_tile::Default2DEpilogueProblem>>; + + using CodegenGemmTraits = + ck_tile::TileGemmTraits; + + using CodegenPipelineProblem = ck_tile::GemmPipelineProblem; + + using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + using Kernel = + ck_tile::BatchedGemmKernel; + + auto kargs = Kernel::MakeKargs(args); + + const dim3 grids = Kernel::GridSize(args); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + } + + public: + void Run(const int M, + const int N, + const int K, + int StrideA = 128, + int StrideB = 128, + int StrideC = 128, + const int BatchStrideA = 32768, + const int BatchStrideB = 16384, + const int BatchStrideC = 32768, + const int BatchCount = 16) + { + using namespace ck_tile::literals; + + auto f_host_tensor_descriptor = [](std::size_t batch_count_, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + if constexpr(std::is_same_v) + { + return ck_tile::HostTensorDescriptor({batch_count_, row, col}, + {batch_stride, stride, 1_uz}); + } + else + { + return ck_tile::HostTensorDescriptor({batch_count_, row, col}, + {batch_stride, 1_uz, stride}); + } + }; + + auto f_get_default_stride = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(stride == 0) + { + // give a chance if stride is zero, return a default packed stride + if constexpr(std::is_same_v) + { + return col; + } + else + { + return row; + } + } + else + return stride; + }; + + StrideA = f_get_default_stride(M, K, StrideA, ALayout{}); + StrideB = f_get_default_stride(K, N, StrideB, BLayout{}); + StrideC = f_get_default_stride(M, N, StrideC, CLayout{}); + + ck_tile::HostTensor a_m_k( + f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{})); + ck_tile::HostTensor b_k_n( + f_host_tensor_descriptor(BatchCount, K, N, StrideB, BatchStrideB, BLayout{})); + ck_tile::HostTensor c_m_n_dev_result( + f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{})); + + ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); + ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); + + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + 
ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + b_k_n_dev_buf.ToDevice(b_k_n.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + batched_gemm_kargs kargs{a_m_k_dev_buf.GetDeviceBuffer(), + b_k_n_dev_buf.GetDeviceBuffer(), + c_m_n_dev_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + StrideC, + BatchStrideA, + BatchStrideB, + BatchStrideC, + BatchCount}; + + invoke_batched_gemm(kargs, + ck_tile::stream_config{nullptr, false}); + + std::cout << "Run kernel with M =" << M << " N =" << N << " K =" << K + << " StrideA =" << StrideA << " StrideB =" << StrideB << " StrideC =" << StrideC + << " BatchStrideA =" << BatchStrideA << " BatchStrideB =" << BatchStrideB + << " BatchStrideC =" << BatchStrideC << " BatchCount =" << BatchCount + << std::endl; + + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + bool pass = true; + + ck_tile::HostTensor c_m_n_host_ref( + f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{})); + c_m_n_host_ref.SetZero(); + + const auto b_n_k = b_k_n.transpose({0, 2, 1}); + ck_tile::reference_batched_gemm( + a_m_k, b_n_k, c_m_n_host_ref); + + pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref); + EXPECT_TRUE(pass); + } +};
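
For orientation beyond the patch itself, the semantics that `check_err` validates can be distilled into plain C++. Below is a minimal, self-contained CPU reference for strided batched GEMM under the same conventions the patch uses (strides and batch strides in elements; `C[g] = A[g] * B[g]` accumulated in a wider type). It is a sketch with illustrative names, independent of ck_tile, assuming packed row-major A (M x K), B (K x N) and C (M x N):

```cpp
#include <cstddef>
#include <vector>

// Minimal strided batched GEMM reference: C[g] = A[g] * B[g] for g in [0, batch_count).
// All strides are element counts; batch strides separate consecutive matrices.
template <typename T, typename Acc = float>
void naive_batched_gemm(const std::vector<T>& a,
                        const std::vector<T>& b,
                        std::vector<T>& c,
                        int M, int N, int K, int batch_count,
                        std::size_t stride_a, std::size_t stride_b, std::size_t stride_c,
                        std::size_t batch_stride_a, std::size_t batch_stride_b,
                        std::size_t batch_stride_c)
{
    for(int g = 0; g < batch_count; ++g)
        for(int i = 0; i < M; ++i)
            for(int j = 0; j < N; ++j)
            {
                Acc acc = 0; // accumulate in a wider type, like AccDataType = float above
                for(int l = 0; l < K; ++l)
                    acc += static_cast<Acc>(a[g * batch_stride_a + i * stride_a + l]) *
                           static_cast<Acc>(b[g * batch_stride_b + l * stride_b + j]);
                c[g * batch_stride_c + i * stride_c + j] = static_cast<T>(acc);
            }
}
```

The patch's own `ck_tile::reference_batched_gemm` plays this role in the GTest above, additionally handling the `{0, 2, 1}` B-tensor transpose and non-row-major layouts; the suite itself builds as the `test_ck_tile_batched_gemm` target added in the CMakeLists change.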