mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
[rocm-libraries] ROCm/rocm-libraries#4299 (commit 668cd49)
173 implement device grouped gemm fixed nk for rdna4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Proposed changes This PR adds an RDNA4 implementation of the device_grouped_gemm_fixed_nk instance library using WMMA. The implementation is based on the existing DeviceGroupedGemm_Xdl_Fixed_NK design and reuses the same high-level structure, but replaces the XDL kernel with a WMMA-based one. It uses the GridwiseGemm_wmma_cshuffle_v3 kernel. At this stage, the focus is functional correctness and compatibility, not performance tuning. ## Technical Details - Device struct for grouped gemm fixed NK - Example code for the WMMA version - Unit tests for both the new WMMA implementation and the reference XDL code (previously missing) - Generic ck profiler interface with the purpose of calling unit tests. ## Checklist Please put an `x` into the boxes that apply. You can also fill these out after creating the PR. If you're not sure, please don't hesitate to ask. - [x] I have added tests relevant to the introduced functionality, and the unit tests are passing locally - [x] I have added the test to REGRESSION_TESTS list defined at the top of CMakeLists.txt in tests/CMakeLists.txt, **IF** the test takes more than 30 seconds to run. - [ ] I have added inline documentation which enables the maintainers to understand the motivation - [ ] I have removed the stale documentation which is no longer relevant after this pull request - [x] (If this change is user-facing) I have added release notes which provide the end users with a brief summary of the improvement from this pull request - [x] I have run clang-format on all changed files - [x] Any dependent changes have been merged ## Discussion If this is a relatively large or complex change, feel free to start a discussion by explaining why you chose the solution you did and what alternatives you considered
This commit is contained in:
committed by
assistant-librarian[bot]
parent
c5ce5eee5b
commit
7b97e197ef
@@ -18,6 +18,12 @@ if (CK_USE_XDL OR CK_USE_WMMA)
|
||||
target_link_libraries(test_grouped_gemm_fastgelu PRIVATE utility device_grouped_gemm_fastgelu_instance)
|
||||
add_dependencies(test_grouped_gemm test_grouped_gemm_fastgelu)
|
||||
endif()
|
||||
|
||||
add_gtest_executable(test_grouped_gemm_fixed_nk test_grouped_gemm_fixed_nk.cpp)
|
||||
if(result EQUAL 0)
|
||||
target_link_libraries(test_grouped_gemm_fixed_nk PRIVATE utility device_grouped_gemm_fixed_nk_instance)
|
||||
add_dependencies(test_grouped_gemm test_grouped_gemm_fixed_nk)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
add_gtest_executable(test_grouped_gemm_interface test_grouped_gemm_interface_xdl.cpp)
|
||||
|
||||
84
test/grouped_gemm/test_grouped_gemm_fixed_nk.cpp
Normal file
84
test/grouped_gemm/test_grouped_gemm_fixed_nk.cpp
Normal file
@@ -0,0 +1,84 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "test_grouped_gemm_util.hpp"
|
||||
|
||||
// Runtime test configuration, overridable from the command line (see main()).
// Declared here and consumed via `extern` declarations in the shared test
// utilities header.
// Bitmask selecting which parameter combinations the profiler runs.
ck::index_t param_mask = 0xffffff;
// Index of a single instance to run; -1 means run all instances.
ck::index_t instance_index = -1;

// Short aliases for the data types exercised by the typed test suite.
using F16  = ck::half_t;
using BF16 = ck::bhalf_t;
using F8   = ck::f8_t;
using I8   = int8_t;

// Short aliases for the tensor layouts used in the kernel type tuples.
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
template <typename Tuple>
|
||||
class TestGroupedGemm
|
||||
: public ck::test::TestGroupedGemm<Tuple, false, ck::test::FixedNKGroupedGemmProfiler>
|
||||
{
|
||||
public:
|
||||
void SetUp() override
|
||||
{
|
||||
ck::test::TestGroupedGemm<Tuple, false, ck::test::FixedNKGroupedGemmProfiler>::SetUp();
|
||||
|
||||
#if defined(CK_USE_WMMA)
|
||||
// The old XDL tests didn't fail if instances were not supported, so we want to keep that
|
||||
// behaviour When compiling WMMA instances and WMMA is supported, then we'll fail if a
|
||||
// specific case is not supported
|
||||
this->fail_if_no_supported_instances_ =
|
||||
ck::is_gfx11_supported() || ck::is_gfx12_supported();
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Layout/data-type combinations under test. Each tuple appears to be
// <ALayout, BLayout, ELayout, ADataType, BDataType, EDataType> — confirm
// against the tuple unpacking in the shared TestGroupedGemm fixture.
// clang-format off
using KernelTypes = ::testing::Types<
// FP8 B-operand cases are compiled only when some form of FP8 support is enabled.
#if CK_USE_OCP_FP8 || CK_USE_FNUZ_FP8 || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || \
    defined(CK_USE_WMMA_FP8)
    ck::Tuple<Row, Row, Row, F16, F8, F16>,
    ck::Tuple<Row, Col, Row, F16, F8, F16>,
#endif

    ck::Tuple<Row, Row, Row, F16, F16, F16>,
    ck::Tuple<Row, Col, Row, F16, F16, F16>,

    ck::Tuple<Row, Row, Row, BF16, BF16, BF16>,
    ck::Tuple<Row, Col, Row, BF16, BF16, BF16>,
    ck::Tuple<Row, Row, Row, BF16, I8, BF16>,
    ck::Tuple<Row, Col, Row, BF16, I8, BF16>,

    ck::Tuple<Row, Row, Row, F16, I8, F16>,
    ck::Tuple<Row, Col, Row, F16, I8, F16>
    >;
// clang-format on

// Instantiate the typed suite over every combination above.
TYPED_TEST_SUITE(TestGroupedGemm, KernelTypes);
|
||||
|
||||
#include "test_grouped_gemm_fixed_nk_cases.inc"
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
testing::InitGoogleTest(&argc, argv);
|
||||
if(argc == 1) {}
|
||||
else if(argc == 3)
|
||||
{
|
||||
param_mask = strtol(argv[1], nullptr, 0);
|
||||
instance_index = atoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "Usage of " << argv[0] << std::endl;
|
||||
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
|
||||
}
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
84
test/grouped_gemm/test_grouped_gemm_fixed_nk_cases.inc
Normal file
84
test/grouped_gemm/test_grouped_gemm_fixed_nk_cases.inc
Normal file
@@ -0,0 +1,84 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
// Degenerate group sizes (M <= 2) exercising the smallest problem shapes.
TYPED_TEST(TestGroupedGemm, TinyCases)
{
    const std::vector<int> ms{2, 1};
    const std::vector<int> ns(ms.size(), 768);
    const std::vector<int> ks(ms.size(), 544);

    this->Run(ms, ns, ks);
}
|
||||
|
||||
// Several small groups with varying M and shared N/K.
TYPED_TEST(TestGroupedGemm, SmallCases)
{
    const std::vector<int> ms{2, 1, 3, 4, 5};
    const std::vector<int> ns(ms.size(), 768);
    const std::vector<int> ks(ms.size(), 544);

    this->Run(ms, ns, ks);
}
|
||||
|
||||
// Mid-sized, non-aligned M values with shared N/K.
TYPED_TEST(TestGroupedGemm, MidCases)
{
    const std::vector<int> ms{167, 183, 177, 153, 139, 204};
    const std::vector<int> ns(ms.size(), 768);
    const std::vector<int> ks(ms.size(), 544);

    this->Run(ms, ns, ks);
}
|
||||
|
||||
// Power-of-two M values — shapes that should not require padding.
TYPED_TEST(TestGroupedGemm, Regular)
{
    const std::vector<int> ms{64, 128, 256};
    const std::vector<int> ns(ms.size(), 768);
    const std::vector<int> ks(ms.size(), 320);

    this->Run(ms, ns, ks);
}
|
||||
|
||||
// M, N and K all chosen so tile padding is required in every dimension.
TYPED_TEST(TestGroupedGemm, MNKPadded)
{
    const std::vector<int> ms{127, 150, 188, 210};
    const std::vector<int> ns(ms.size(), 136);
    const std::vector<int> ks(ms.size(), 280);

    this->Run(ms, ns, ks);
}
|
||||
|
||||
// Large-K problems run with explicit split-K (k-batch) factors.
TYPED_TEST(TestGroupedGemm, TestLargeKBatch)
{
    // In some cases Split K is not supported. Running this test would fail since no instance
    // will be supported, so we skip the test.
    // Fixed duplicated word ("for for") in the user-facing skip message.
    if(!this->IsSplitKSupported())
        GTEST_SKIP() << "Split-K not supported for the current configuration (FP16/BF16 on "
                        "GFX11, or using CDE element-wise operation)";

    const std::vector<int> Ms{188, 210};
    constexpr int N = 768;
    constexpr int K = 4096;

    const std::vector<int> Ns(Ms.size(), N);
    const std::vector<int> Ks(Ms.size(), K);

    // Large split-K factors to exercise the k-batch > 1 code paths.
    this->k_batches_ = {32, 64};

    this->Run(Ms, Ns, Ks);
}
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
|
||||
#include "profiler/profile_grouped_gemm_impl.hpp"
|
||||
#include "profiler/profile_grouped_gemm_fixed_nk_impl.hpp"
|
||||
|
||||
extern ck::index_t param_mask;
|
||||
extern ck::index_t instance_index;
|
||||
@@ -23,7 +24,124 @@ extern ck::index_t instance_index;
|
||||
namespace ck {
|
||||
namespace test {
|
||||
|
||||
// Default profiler adapter used by the generic grouped-GEMM tests: forwards the
// full, element-op-aware parameter set to ck::profiler::profile_grouped_gemm_impl
// and returns its pass/fail result.
// NOTE(review): the FailIfNoSupportedInstances template parameter is not read in
// this struct — only the runtime flag `fail_if_no_supported_instances` is
// forwarded. Confirm the template parameter is intentional.
template <typename Tuple, bool FailIfNoSupportedInstances = false>
struct DefaultGroupedGemmProfiler
{
    template <typename ADataType,
              typename BDataType,
              typename EDataType,
              typename AccDataType,
              typename ALayout,
              typename BLayout,
              typename ELayout,
              typename AElementOp,
              typename BElementOp,
              typename CDEElementOp>
    static bool Run(bool verify,
                    int init_method,
                    bool log,
                    bool bench,
                    const std::vector<int>& Ms,
                    const std::vector<int>& Ns,
                    const std::vector<int>& Ks,
                    const std::vector<int>& StrideAs,
                    const std::vector<int>& StrideBs,
                    const std::vector<int>& StrideCs,
                    const std::vector<int>& kbatches,
                    int n_warmup,
                    int n_iter,
                    int instance_index,
                    bool fail_if_no_supported_instances)
    {
        // Straight pass-through; all policy lives in the profiler implementation.
        return ck::profiler::profile_grouped_gemm_impl<ADataType,
                                                       BDataType,
                                                       EDataType,
                                                       AccDataType,
                                                       ALayout,
                                                       BLayout,
                                                       ELayout,
                                                       AElementOp,
                                                       BElementOp,
                                                       CDEElementOp>(
            verify,
            init_method,
            log,
            bench,
            Ms,
            Ns,
            Ks,
            StrideAs,
            StrideBs,
            StrideCs,
            kbatches,
            n_warmup,
            n_iter,
            instance_index,
            fail_if_no_supported_instances);
    }
};
|
||||
|
||||
struct FixedNKGroupedGemmProfiler
|
||||
{
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename EDataType,
|
||||
typename AccDataType,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename CLayout>
|
||||
static bool Run(bool verify,
|
||||
int init_method,
|
||||
bool log,
|
||||
bool bench,
|
||||
const std::vector<int>& Ms,
|
||||
const std::vector<int>& Ns,
|
||||
const std::vector<int>& Ks,
|
||||
const std::vector<int>& StrideAs,
|
||||
const std::vector<int>& StrideBs,
|
||||
const std::vector<int>& StrideCs,
|
||||
const std::vector<int>& kbatches,
|
||||
int n_warmup,
|
||||
int n_iter,
|
||||
int /*instance_index*/,
|
||||
bool /*fail_if_no_supported_instances*/)
|
||||
{
|
||||
bool pass = true;
|
||||
for(int kbatch : kbatches)
|
||||
{
|
||||
try
|
||||
{
|
||||
pass &= ck::profiler::profile_grouped_gemm_fixed_nk_impl<ADataType,
|
||||
BDataType,
|
||||
EDataType,
|
||||
AccDataType,
|
||||
ALayout,
|
||||
BLayout,
|
||||
CLayout>(verify,
|
||||
init_method,
|
||||
log,
|
||||
bench,
|
||||
Ms,
|
||||
Ns,
|
||||
Ks,
|
||||
StrideAs,
|
||||
StrideBs,
|
||||
StrideCs,
|
||||
kbatch,
|
||||
n_warmup,
|
||||
n_iter);
|
||||
}
|
||||
catch(const std::exception& e)
|
||||
{
|
||||
std::cerr << e.what() << std::endl;
|
||||
}
|
||||
}
|
||||
return pass;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Tuple,
|
||||
bool FailIfNoSupportedInstances = false,
|
||||
typename Profiler = ck::test::DefaultGroupedGemmProfiler>
|
||||
class TestGroupedGemm : public testing::Test
|
||||
{
|
||||
protected:
|
||||
@@ -76,7 +194,7 @@ class TestGroupedGemm : public testing::Test
|
||||
}
|
||||
else
|
||||
{
|
||||
k_batches_ = {1, 2, 3, 5, 8};
|
||||
k_batches_ = {1, 2, 3, 4, 8};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -146,31 +264,61 @@ class TestGroupedGemm : public testing::Test
|
||||
const std::vector<int>& StrideCs,
|
||||
const std::vector<int>& kbatches)
|
||||
{
|
||||
bool pass =
|
||||
ck::profiler::profile_grouped_gemm_impl<ADataType,
|
||||
BDataType,
|
||||
EDataType,
|
||||
float,
|
||||
ALayout,
|
||||
BLayout,
|
||||
ELayout,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>(verify_,
|
||||
init_method_,
|
||||
log_,
|
||||
bench_,
|
||||
Ms,
|
||||
Ns,
|
||||
Ks,
|
||||
StrideAs,
|
||||
StrideBs,
|
||||
StrideCs,
|
||||
kbatches,
|
||||
n_warmup_,
|
||||
n_iter_,
|
||||
instance_index,
|
||||
fail_if_no_supported_instances_);
|
||||
bool pass = false;
|
||||
using AccDataType = float;
|
||||
|
||||
if constexpr(std::is_same_v<Profiler, FixedNKGroupedGemmProfiler>)
|
||||
{
|
||||
pass = Profiler::template Run<ADataType,
|
||||
BDataType,
|
||||
EDataType,
|
||||
AccDataType,
|
||||
ALayout,
|
||||
BLayout,
|
||||
ELayout>(verify_,
|
||||
init_method_,
|
||||
log_,
|
||||
bench_,
|
||||
Ms,
|
||||
Ns,
|
||||
Ks,
|
||||
StrideAs,
|
||||
StrideBs,
|
||||
StrideCs,
|
||||
kbatches,
|
||||
n_warmup_,
|
||||
n_iter_,
|
||||
instance_index,
|
||||
fail_if_no_supported_instances_);
|
||||
}
|
||||
else
|
||||
{
|
||||
pass = Profiler::template Run<ADataType,
|
||||
BDataType,
|
||||
EDataType,
|
||||
AccDataType,
|
||||
ALayout,
|
||||
BLayout,
|
||||
ELayout,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>(verify_,
|
||||
init_method_,
|
||||
log_,
|
||||
bench_,
|
||||
Ms,
|
||||
Ns,
|
||||
Ks,
|
||||
StrideAs,
|
||||
StrideBs,
|
||||
StrideCs,
|
||||
kbatches,
|
||||
n_warmup_,
|
||||
n_iter_,
|
||||
instance_index,
|
||||
fail_if_no_supported_instances_);
|
||||
}
|
||||
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user