mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 11:30:02 +00:00
implement device batched gemm b scale for wmma (#2825)
* rebased on top of develop
* fixed missing shuffling and wrong indexing
* added tests for batched_b_scale
* added missing files
* fixed wrong stride computation and removed k batching (for now) due to precision issues
* reinstated k-batching with PRNG constrained to -1..1
* added specialization of GeneratorTensor_3 for int4 and fixed internal overflow
* added k-batching to reference and increased tolerances for test
* changed gemm_b_scale and gemm_universal tests to use correct parameters
* addressed review comments
* ported fixes back to non-batched version of b_scale
* addressed review comments
* run clang-format on older commits
* add type-conversion to AccDataType and then to CDataType to exactly mimic GPU's behavior
* added newline at end of file
* reflected changes from muitl-abd branch in batched b_scale
* fixed gfx11 issue
* changed range for pk_i4 to -1...1 (-0.5...0.5 never really made sense for i4 anyway and always should have caused compiler errors, but since there was no int4 specialization of GeneratorTensor3 until now, this passed)
* run clang format
* set range of i4 generation to 0...1 for upstream tests to pass. This replicated previous behavior, which however means that it is NOT properly tested.
* reduced range for pk_i4 even further to 0..0
* removed failing xld instances. Failure now uncovered now that tests were fixed
* removed generation of int4 values entirely
* divide B buffer by BPackedSize
---------
Co-authored-by: Kevin Abraham <kevin.abraham@streamhpc.com>
[ROCm/composable_kernel commit: c4b2da9cbd]
This commit is contained in:
@@ -24,6 +24,7 @@ set(REGRESSION_TESTS
|
||||
test_batched_gemm_softmax_gemm_permute_bf16
|
||||
test_batched_gemm_bias_softmax_gemm_permute_bf16
|
||||
test_grouped_gemm_splitk
|
||||
test_batched_gemm_b_scale_wmma
|
||||
test_reduce_no_index
|
||||
test_reduce_with_index
|
||||
test_convnd_fwd
|
||||
@@ -257,6 +258,7 @@ add_subdirectory(batched_gemm_reduce)
|
||||
add_subdirectory(batched_gemm_gemm)
|
||||
add_subdirectory(batched_gemm_softmax_gemm)
|
||||
add_subdirectory(batched_gemm_softmax_gemm_permute)
|
||||
add_subdirectory(batched_gemm_b_scale)
|
||||
add_subdirectory(grouped_gemm)
|
||||
add_subdirectory(reduce)
|
||||
add_subdirectory(convnd_fwd)
|
||||
|
||||
5
test/batched_gemm_b_scale/CMakeLists.txt
Normal file
5
test/batched_gemm_b_scale/CMakeLists.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
|
||||
# Batched GEMM with B-scale (WMMA) unit tests.
add_gtest_executable(test_batched_gemm_b_scale_wmma test_batched_gemm_b_scale_wmma.cpp)
# NOTE(review): `result` is presumably set by add_gtest_executable in this
# scope (0 == target created) — confirm against the helper's definition.
if(result EQUAL 0)
    target_link_libraries(test_batched_gemm_b_scale_wmma PRIVATE utility device_batched_gemm_b_scale_instance)
endif()
|
||||
@@ -0,0 +1,49 @@
|
||||
#pragma once
|
||||
|
||||
// Sweep tiny M values (partial tiles) against fixed N/K and batch count.
TYPED_TEST(TestBatchedGemmBScale_MK_NK, SmallM)
{
    constexpr int N = 256;
    constexpr int K = 1024;

    // Packed strides for the MK (A), NK (B), MN (C) layouts of this fixture.
    constexpr int StrideA = K;
    constexpr int StrideB = K;
    constexpr int StrideC = N;

    constexpr int NBatches = 10;

    for(const int M : {1, 2, 3, 4, 5, 6})
    {
        this->Run(M, N, K, StrideA, StrideB, StrideC, NBatches);
    }
}
|
||||
|
||||
// Sweep mid-size, mostly non-power-of-two M values with fixed N/K.
TYPED_TEST(TestBatchedGemmBScale_MK_NK, MidLargeM)
{
    constexpr int N = 512;
    constexpr int K = 768;

    // Packed strides for the MK (A), NK (B), MN (C) layouts of this fixture.
    constexpr int StrideA = K;
    constexpr int StrideB = K;
    constexpr int StrideC = N;

    constexpr int NBatches = 7;

    for(const int M : {127, 255, 312, 799, 1573})
    {
        this->Run(M, N, K, StrideA, StrideB, StrideC, NBatches);
    }
}
|
||||
|
||||
// Regular power-of-two problem sizes.
TYPED_TEST(TestBatchedGemmBScale_MK_NK, Regular)
{
    constexpr int N = 512;
    constexpr int K = 1024;

    // Packed strides for the MK (A), NK (B), MN (C) layouts of this fixture.
    constexpr int StrideA = K;
    constexpr int StrideB = K;
    constexpr int StrideC = N;

    constexpr int NBatches = 3;

    for(const int M : {512, 1024})
    {
        this->Run(M, N, K, StrideA, StrideB, StrideC, NBatches);
    }
}
|
||||
108
test/batched_gemm_b_scale/test_batched_gemm_b_scale_util.hpp
Normal file
108
test/batched_gemm_b_scale/test_batched_gemm_b_scale_util.hpp
Normal file
@@ -0,0 +1,108 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "include/ck/utility/data_type.hpp"
|
||||
#include "profiler/profile_batched_gemm_b_scale_impl.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace test {
|
||||
|
||||
template <typename Tuple>
|
||||
class TestBatchedGemmBScale : public testing::Test
|
||||
{
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using F32 = float;
|
||||
|
||||
protected:
|
||||
using ALayout = std::tuple_element_t<0, Tuple>;
|
||||
using BLayout = std::tuple_element_t<1, Tuple>;
|
||||
using CLayout = Row;
|
||||
using ADataType = std::tuple_element_t<2, Tuple>;
|
||||
using BDataType = std::tuple_element_t<3, Tuple>;
|
||||
using BScaleDataType = std::tuple_element_t<4, Tuple>;
|
||||
using ComputeDataType = std::tuple_element_t<5, Tuple>;
|
||||
using CDataType = std::tuple_element_t<6, Tuple>;
|
||||
|
||||
public:
|
||||
static constexpr ck::index_t ScaleBlockK = 128; // all instances
|
||||
static constexpr bool verify_ = true;
|
||||
static constexpr int init_method_ = 2;
|
||||
static constexpr bool log_ = false;
|
||||
static constexpr bool bench_ = false; // measure kernel performance
|
||||
std::vector<int> k_batches_;
|
||||
|
||||
void SetUp() override { k_batches_ = {1, 2}; }
|
||||
|
||||
void Run(const int M,
|
||||
const int N,
|
||||
const int K,
|
||||
const int StrideA,
|
||||
const int StrideB,
|
||||
const int StrideC,
|
||||
const int NBatch)
|
||||
{
|
||||
for(auto kb : k_batches_)
|
||||
{
|
||||
RunSingle(M, N, K, StrideA, StrideB, StrideC, NBatch, kb);
|
||||
}
|
||||
}
|
||||
|
||||
void RunSingle(const int M,
|
||||
const int N,
|
||||
const int K,
|
||||
const int StrideA,
|
||||
const int StrideB,
|
||||
const int StrideC,
|
||||
const int Nbatch,
|
||||
int kbatch = 1,
|
||||
int n_warmup = 1,
|
||||
int n_iter = 10)
|
||||
{
|
||||
const int BatchStrideA = StrideA * M;
|
||||
const int BatchStrideB = StrideB * K;
|
||||
const int BatchStrideC = StrideC * M;
|
||||
const int BatchStrideScaleB = StrideB * K;
|
||||
bool pass = ck::profiler::profile_batched_gemm_b_scale_impl<ADataType,
|
||||
BDataType,
|
||||
BScaleDataType,
|
||||
ComputeDataType,
|
||||
F32,
|
||||
CDataType,
|
||||
ScaleBlockK,
|
||||
ALayout,
|
||||
BLayout,
|
||||
CLayout>(verify_,
|
||||
init_method_,
|
||||
log_,
|
||||
bench_,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
StrideC,
|
||||
BatchStrideA,
|
||||
BatchStrideB,
|
||||
BatchStrideC,
|
||||
BatchStrideScaleB,
|
||||
Nbatch,
|
||||
kbatch,
|
||||
n_warmup,
|
||||
n_iter);
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace test
|
||||
} // namespace ck
|
||||
45
test/batched_gemm_b_scale/test_batched_gemm_b_scale_wmma.cpp
Normal file
45
test/batched_gemm_b_scale/test_batched_gemm_b_scale_wmma.cpp
Normal file
@@ -0,0 +1,45 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "test_batched_gemm_b_scale_util.hpp"
|
||||
|
||||
using I4 = ck::pk_i4_t;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
namespace {

// Concatenates the element type lists of two std::tuple specializations.
template <typename Lhs, typename Rhs>
struct tuple_concat;

template <typename... Ls, typename... Rs>
struct tuple_concat<std::tuple<Ls...>, std::tuple<Rs...>>
{
    using type = std::tuple<Ls..., Rs...>;
};

} // namespace
|
||||
|
||||
// Fixture pinned to row-major A (MK) and column-major B (NK): the layout
// pair is prepended to each kernel-type tuple before reaching the base.
template <typename Tuple>
class TestBatchedGemmBScale_MK_NK
    : public ck::test::TestBatchedGemmBScale<
          typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
{
};

// clang-format off
using KernelTypes_MK_NK = ::testing::Types<
    //         ADataType, BDataType, BScaleDataType, ComputeDataType, CDataType
    std::tuple<      F16,        I4,            F16,             F16,       F16>
    >;
// clang-format on

TYPED_TEST_SUITE(TestBatchedGemmBScale_MK_NK, KernelTypes_MK_NK);

#include "test_batched_gemm_b_scale_ut_cases.inc"
|
||||
Reference in New Issue
Block a user