Files
composable_kernel/test/grouped_gemm/test_grouped_gemm_interface.cpp
Adam Osewski 70e4eb567f Multiple fixes to GroupedGemm+SplitK (#707)
* Add license header.

* Reduce number of logged output. Add constant initialization.

* Add functional tests for grouped_gemm with different kbatch value.

* Add debug log informations + remove unused code.

* Don't pass kbatch to CalculateKPadded.

* Turn on logging in grouped gemm and gemm splitk profiler

* Debug: limit number of test cases to run;

* Log more information and initialize with constant value.

* Turn on DEBUG_LOG

* Add more debug log informations.

* Limit the number of instances to compile.

* Use GridwiseGemmPipeline

* Use KBatch to calculate K0

* Multiple DebugLog messages.

* Unit tests for multiple KBatch values.

* Refactoring

* Disable logging
* extract out of if statement KBatch update.

* Uncomment instances.

* Disable DebugLog.

* Use Kbatch when calculate KPadded.

* Fix CGridDesc padding.

* Use available helper functions.

* Uncomment code commented for debuggin.

* Remove unnecessary debug log messages.

* Uncomment previously commented code for debug purposes.

* Add KBatch info to profiler output summary log.

* Add gtests for gemm splitk using ckProfiler API.

* Add more test-cases for different data layout.

* Add more test cases for gemm splitk

* Remove old test.

* Unit tests for MKNK ggemm interface.

* Fix and add more unit-tests.

* Constepxr everything!

* Increase error threshold for fp16 and splitk.

Since we're using fp16 atomic add for splitk there's a
known precision loss.

---------

Co-authored-by: Adam Osewski <aosewski@amd.com>
Co-authored-by: zjing14 <zhangjing14@gmail.com>
2023-05-30 07:09:06 -05:00

203 lines
7.8 KiB
C++

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <stdexcept>
#include <vector>
#include "gtest/gtest.h"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "test_grouped_gemm_util.hpp"
class TestGGemmSplitKInterface_MKNKMN : public ::testing::Test
{
protected:
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using ALayout = Row;
using BLayout = Col;
using ELayout = Row;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
template <ck::tensor_operation::device::GemmSpecialization GemmSpec,
ck::index_t KPerBlock,
ck::index_t K1,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t CDEBlockTransferScalarPerVector_NPerBlock>
using GGemmInstance =
ck::test::DeviceGroupedGemmSplitkInstanceWrapper<ALayout,
BLayout,
ELayout,
GemmSpec,
KPerBlock,
K1,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
CDEBlockTransferScalarPerVector_NPerBlock>;
using DefaultGGemmInstance = GGemmInstance<GemmDefault, 32, 8, 4, 8, 8>;
};
TEST_F(TestGGemmSplitKInterface_MKNKMN, TileSize)
{
std::vector<int> Ms{128, 256, 188, 512};
constexpr int N = 256;
constexpr int K = 128;
std::vector<int> Ns(Ms.size(), N);
std::vector<int> Ks(Ms.size(), K);
std::vector<int> StrideAs(Ms.size(), K);
std::vector<int> StrideBs(Ms.size(), K);
std::vector<int> StrideCs(Ms.size(), N);
// M % MPerBlock
EXPECT_FALSE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
Ms = std::vector<int>{256, 128, 128, 512};
Ns = std::vector<int>{256, 177, 128, 512};
// N % NPerBlock
EXPECT_FALSE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
}
TEST_F(TestGGemmSplitKInterface_MKNKMN, VectorLoadWidth)
{
static constexpr auto GemmMNKPadding =
ck::tensor_operation::device::GemmSpecialization::MNKPadding;
using PaddedGGemmInstance = GGemmInstance<GemmMNKPadding, 32, 8, 4, 8, 8>;
std::vector<int> Ms{128, 256, 256, 512};
constexpr int N = 256;
constexpr int K = 512;
std::vector<int> Ns(Ms.size(), N);
std::vector<int> Ks(Ms.size(), K);
std::vector<int> StrideAs(Ms.size(), K);
std::vector<int> StrideBs(Ms.size(), K);
std::vector<int> StrideCs(Ms.size(), N);
// K % ABlockTransferSrcScalarPerVector
Ks = std::vector<int>{256, 177, 128, 512};
EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
Ks = std::vector<int>{256, 164, 128, 512};
// K % BBlockTransferSrcScalarPerVector
EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
Ks = std::vector<int>(4, 128);
Ns = std::vector<int>{256, 127, 128, 512};
// N % CBlockTransferScalarPerVector_NWaveNPerXDL
EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
}
TEST_F(TestGGemmSplitKInterface_MKNKMN, KLoops)
{
std::vector<int> Ms{128, 256, 256, 512};
constexpr int N = 256;
constexpr int K = 128;
constexpr int kbatch = 4;
std::vector<int> Ns(Ms.size(), N);
std::vector<int> Ks(Ms.size(), K);
std::vector<int> StrideAs(Ms.size(), K);
std::vector<int> StrideBs(Ms.size(), K);
std::vector<int> StrideCs(Ms.size(), N);
// kloops % 2
Ks = std::vector<int>{256, 512, 320, 768};
EXPECT_FALSE(
DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));
// Not all gemms have same value for main_k0_block_loop!
Ks = std::vector<int>{256, 512, 512, 512};
EXPECT_THROW(DefaultGGemmInstance{}.Run(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch),
std::runtime_error);
}
class TestGGemmSplitKInterface_KMKNNM : public ::testing::Test
{
protected:
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using ALayout = Col;
using BLayout = Row;
using ELayout = Col;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
template <ck::tensor_operation::device::GemmSpecialization GemmSpec,
ck::index_t KPerBlock,
ck::index_t K1,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t CDEBlockTransferScalarPerVector_NPerBlock>
using GGemmInstance =
ck::test::DeviceGroupedGemmSplitkInstanceWrapper<ALayout,
BLayout,
ELayout,
GemmSpec,
KPerBlock,
K1,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
CDEBlockTransferScalarPerVector_NPerBlock>;
using DefaultGGemmInstance = GGemmInstance<GemmDefault, 32, 8, 4, 8, 4>;
};
TEST_F(TestGGemmSplitKInterface_KMKNNM, TileSize)
{
std::vector<int> Ms{128, 256, 188, 512};
constexpr int N = 256;
constexpr int K = 128;
std::vector<int> Ns(Ms.size(), N);
std::vector<int> Ks(Ms.size(), K);
std::vector<int> StrideAs(Ms.size(), K);
std::vector<int> StrideBs(Ms.size(), K);
std::vector<int> StrideCs(Ms.size(), N);
// M % MPerBlock
EXPECT_FALSE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
Ms = std::vector<int>{128, 256, 256, 512};
Ns = std::vector<int>{256, 177, 128, 512};
// N % NPerBlock
EXPECT_FALSE(DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
}
TEST_F(TestGGemmSplitKInterface_KMKNNM, VectorLoadWidth)
{
static constexpr auto GemmMNKPadding =
ck::tensor_operation::device::GemmSpecialization::MNKPadding;
using PaddedGGemmInstance = GGemmInstance<GemmMNKPadding, 32, 8, 2, 8, 4>;
std::vector<int> Ms{128, 256, 256, 512};
constexpr int N = 256;
constexpr int K = 512;
std::vector<int> Ns(Ms.size(), N);
std::vector<int> Ks(Ms.size(), K);
std::vector<int> StrideAs(Ms.size(), K);
std::vector<int> StrideBs(Ms.size(), K);
std::vector<int> StrideCs(Ms.size(), N);
// M % ABlockTransferSrcScalarPerVector
Ms = std::vector<int>{256, 177, 128, 512};
EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
Ms = std::vector<int>{128, 256, 256, 512};
Ns = std::vector<int>{256, 164, 128, 512};
// N % BBlockTransferSrcScalarPerVector
EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
Ns = std::vector<int>{128, 256, 256, 512};
Ms = std::vector<int>{256, 130, 128, 512};
// M % CBlockTransferScalarPerVector_NWaveNPerXDL
EXPECT_FALSE(PaddedGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs));
}