Merge commit '507d81c3af51b81f15b946a2a4bef7f594620292' into develop

assistant-librarian[bot] committed 2025-11-03 20:14:18 +00:00
parent 7ce8c0cf8f
commit a8059a2e58
19 changed files with 777 additions and 172 deletions

View File

@@ -245,10 +245,13 @@ add_subdirectory(conv_util)
add_subdirectory(reference_conv_fwd)
add_subdirectory(gemm)
add_subdirectory(gemm_add)
add_subdirectory(gemm_blockscale_wp)
add_subdirectory(gemm_layernorm)
add_subdirectory(gemm_multi_abd)
add_subdirectory(gemm_multiply_multiply_wp)
add_subdirectory(gemm_split_k)
add_subdirectory(gemm_universal)
add_subdirectory(gemm_universal_preshuffle)
add_subdirectory(gemm_b_scale)
add_subdirectory(gemm_universal_streamk)
add_subdirectory(gemm_reduce)

View File

@@ -0,0 +1,6 @@
if(GPU_TARGETS MATCHES "gfx9[45]|gfx12")
    add_gtest_executable(test_gemm_blockscale_wp_xdl_fp8 test_gemm_blockscale_wp_xdl_fp8.cpp)
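    # NOTE: 'result' is assumed to be set by add_gtest_executable (0 when the test
    # target was successfully created); this snippet does not define it itself.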
    if(result EQUAL 0)
        target_link_libraries(test_gemm_blockscale_wp_xdl_fp8 PRIVATE utility device_gemm_blockscale_wp_instance)
    endif()
endif()

View File

@@ -0,0 +1,64 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_common.hpp"
using F8 = ck::f8_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
namespace {

template <typename X, typename Y>
struct tuple_concat;

template <typename... Xs, typename... Ys>
struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
{
    using type = std::tuple<Xs..., Ys...>;
};

} // namespace
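// Editorial sketch (not part of the commit): tuple_concat simply splices two type
// lists, so prepending the two layout tags to the six-type kernel tuple below yields
// the eight-element tuple that TestGemmBlockscaleWPCommon expects.
static_assert(std::tuple_size_v<tuple_concat<std::tuple<Row, Col>,
                                             std::tuple<F8, F32, F8, F32, F8, BF16>>::type> == 8,
              "two layout tags + six data types = eight tuple elements");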
template <typename Tuple>
class TestGemmBlockScaleWP_FP8_MK_NK
    : public ck::test::TestGemmBlockscaleWPCommon<
          typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
{
};
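// Each KernelTypes tuple supplies, in order: A0 (quantized A), A1 (A scale),
// B0 (quantized B), B1 (B scale), the compute type, and the output C type; the
// fixture prepends the Row/Col layouts for A and B (mapping taken from
// TestGemmBlockscaleWPCommon; the scale-type labels are an inference).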
// clang-format off
using KernelTypes_MK_NK = ::testing::Types<
#if defined(CK_ENABLE_FP8)
    std::tuple< F8, F32, F8, F32, F8, BF16>
#endif
    >;
// clang-format on
TYPED_TEST_SUITE(TestGemmBlockScaleWP_FP8_MK_NK, KernelTypes_MK_NK);
TYPED_TEST(TestGemmBlockScaleWP_FP8_MK_NK, Regular0)
{
    std::vector<int> Ms{128, 256, 512};
    constexpr int N = 512;
    constexpr int K = 2048;

    for(int M : Ms)
        this->Run(M, N, K);
}
TYPED_TEST(TestGemmBlockScaleWP_FP8_MK_NK, Regular1)
{
    std::vector<int> Ms{128, 256, 512};
    constexpr int N = 1024;
    constexpr int K = 4096;

    for(int M : Ms)
        this->Run(M, N, K);
}

View File

@@ -0,0 +1,77 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/ck.hpp"
#include "profiler/profile_gemm_blockscale_wp_impl.hpp"
namespace ck {
namespace test {
using Row = ck::tensor_layout::gemm::RowMajor;
using F32 = float;
template <typename Tuple>
class TestGemmBlockscaleWPCommon : public ::testing::Test
{
    protected:
    using ALayout = std::tuple_element_t<0, Tuple>;
    using BLayout = std::tuple_element_t<1, Tuple>;
    using CLayout = Row;

    using A0DataType      = std::tuple_element_t<2, Tuple>;
    using A1DataType      = std::tuple_element_t<3, Tuple>;
    using B0DataType      = std::tuple_element_t<4, Tuple>;
    using B1DataType      = std::tuple_element_t<5, Tuple>;
    using ComputeDataType = std::tuple_element_t<6, Tuple>;
    using CDataType       = std::tuple_element_t<7, Tuple>;
    public:
    static constexpr bool verify_     = true;
    static constexpr int init_method_ = 1;
    static constexpr bool log_        = false;
    static constexpr bool bench_      = false;
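    // Scale-block shape: presumably one A scale per 1x128 (M x K) tile and one
    // B scale per 128x128 (N x K) tile, in line with common FP8 block-scaling schemes.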
    static constexpr index_t ScaleBlockM = 1;
    static constexpr index_t ScaleBlockN = 128;
    static constexpr index_t ScaleBlockK = 128;
    void Run(const int M, const int N, const int K, int n_warmup = 1, int n_iter = 10)
    {
        bool all_success = true;
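        // Leading dimensions: for a row-major operand the stride is the length of a
        // row; for a column-major operand, the length of a column.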
        int StrideA = std::is_same_v<ALayout, Row> ? K : M;
        int StrideB = std::is_same_v<BLayout, Row> ? N : K;
        int StrideC = std::is_same_v<CLayout, Row> ? N : M;
        all_success =
            all_success &
            ck::profiler::profile_gemm_blockscale_weightpreshuffle_impl<A0DataType,
                                                                        A1DataType,
                                                                        B0DataType,
                                                                        B1DataType,
                                                                        ComputeDataType,
                                                                        F32,
                                                                        CDataType,
                                                                        ScaleBlockM,
                                                                        ScaleBlockN,
                                                                        ScaleBlockK,
                                                                        ALayout,
                                                                        BLayout,
                                                                        CLayout>(verify_,
                                                                                 init_method_,
                                                                                 log_,
                                                                                 bench_,
                                                                                 M,
                                                                                 N,
                                                                                 K,
                                                                                 StrideA,
                                                                                 StrideB,
                                                                                 StrideC,
                                                                                 n_warmup,
                                                                                 n_iter);

        EXPECT_TRUE(all_success);
    }
};
} // namespace test
} // namespace ck

View File

@@ -0,0 +1,6 @@
if(GPU_TARGETS MATCHES "gfx9[45]|gfx12")
    add_gtest_executable(test_gemm_multiply_multiply_wp_xdl_fp8 test_gemm_multiply_multiply_wp_xdl_fp8.cpp)
    if(result EQUAL 0)
        target_link_libraries(test_gemm_multiply_multiply_wp_xdl_fp8 PRIVATE utility device_gemm_multiply_multiply_wp_instance)
    endif()
endif()

View File

@@ -0,0 +1,93 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/ck.hpp"
#include "profiler/profile_gemm_multiply_multiply_wp_impl.hpp"
namespace ck {
namespace test {
using Row = ck::tensor_layout::gemm::RowMajor;
using F32 = float;
template <typename Tuple>
class TestGemmMultiplyMultiplyWPCommon : public ::testing::Test
{
    protected:
    using ALayout  = std::tuple_element_t<0, Tuple>;
    using BLayout  = std::tuple_element_t<1, Tuple>;
    using D0Layout = std::tuple_element_t<2, Tuple>;
    using D1Layout = std::tuple_element_t<3, Tuple>;
    using ELayout  = Row;

    using ADataType       = std::tuple_element_t<4, Tuple>;
    using BDataType       = std::tuple_element_t<5, Tuple>;
    using ComputeDataType = std::tuple_element_t<6, Tuple>;
    using D0DataType      = std::tuple_element_t<7, Tuple>;
    using D1DataType      = std::tuple_element_t<8, Tuple>;
    using EDataType       = std::tuple_element_t<9, Tuple>;
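    // D0 and D1 are the two auxiliary tensors multiplied elementwise into the GEMM
    // result to produce E (inferred from the profiler routine's name below).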
    public:
    static constexpr bool verify_     = true;
    static constexpr int init_method_ = 1; // decimal value initialization
    static constexpr bool log_        = false;
    static constexpr bool bench_      = false; // measure kernel performance

    std::vector<int> k_batches_;
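    // Split-K factors swept by Run(): kbatch > 1 partitions the K-dimension
    // reduction across workgroups (standard split-K GEMM decomposition).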
    void SetUp() override { k_batches_ = {1, 2, 4}; }

    void Run(const int M, const int N, const int K)
    {
        for(size_t i = 0; i < k_batches_.size(); i++)
        {
            RunSingle(M, N, K, k_batches_[i]);
        }
    }
    void RunSingle(
        const int M, const int N, const int K, int kbatch = 1, int n_warmup = 1, int n_iter = 10)
    {
        bool all_success = true;

        int StrideA  = std::is_same_v<remove_cvref_t<ALayout>, Row> ? K : M;
        int StrideB  = std::is_same_v<remove_cvref_t<BLayout>, Row> ? N : K;
        int StrideD0 = std::is_same_v<remove_cvref_t<D0Layout>, Row> ? N : M;
        int StrideD1 = std::is_same_v<remove_cvref_t<D1Layout>, Row> ? N : M;
        int StrideE  = std::is_same_v<ELayout, Row> ? N : M;

        all_success =
            all_success &
            ck::profiler::profile_gemm_multiply_multiply_weight_preshuffle_impl<ADataType,
                                                                                BDataType,
                                                                                ComputeDataType,
                                                                                F32,
                                                                                D0DataType,
                                                                                D1DataType,
                                                                                EDataType,
                                                                                ALayout,
                                                                                BLayout,
                                                                                D0Layout,
                                                                                D1Layout,
                                                                                ELayout>(
                verify_,
                init_method_,
                log_,
                bench_,
                M,
                N,
                K,
                StrideA,
                StrideB,
                StrideD0,
                StrideD1,
                StrideE,
                kbatch,
                n_warmup,
                n_iter);

        EXPECT_TRUE(all_success);
    }
};
} // namespace test
} // namespace ck

View File

@@ -0,0 +1,77 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_common.hpp"
using F8 = ck::f8_t;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
namespace {

template <typename X, typename Y>
struct tuple_concat;

template <typename... Xs, typename... Ys>
struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
{
    using type = std::tuple<Xs..., Ys...>;
};

} // namespace
template <typename Tuple>
class TestGemmMultiplyMultiplyWP_FP8_MK_NK
    : public ck::test::TestGemmMultiplyMultiplyWPCommon<
          typename tuple_concat<std::tuple<Row, Col, Row, Col>, Tuple>::type>
{
};
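// Each KernelTypes tuple supplies, in order: the A, B, compute, D0, D1, and output E
// data types; the fixture prepends the Row/Col layouts for A, B, D0, and D1 (mapping
// taken from TestGemmMultiplyMultiplyWPCommon).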
// clang-format off
using KernelTypes_MK_NK = ::testing::Types<
#if defined(CK_ENABLE_FP8)
    std::tuple< F8, F8, F8, F32, F32, F16>,
    std::tuple< F8, F8, F8, F32, F32, BF16>
#endif
    >;
// clang-format on
TYPED_TEST_SUITE(TestGemmMultiplyMultiplyWP_FP8_MK_NK, KernelTypes_MK_NK);
TYPED_TEST(TestGemmMultiplyMultiplyWP_FP8_MK_NK, Regular0)
{
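    // 224 and 448 are presumably chosen because they are not multiples of the wider
    // tile sizes, exercising the kernels' partial-tile paths.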
    std::vector<int> Ms{128, 224, 256, 448, 512};
    constexpr int N = 512;
    constexpr int K = 2048;

    for(int M : Ms)
        this->Run(M, N, K);
}
TYPED_TEST(TestGemmMultiplyMultiplyWP_FP8_MK_NK, Regular1)
{
    std::vector<int> Ms{128, 224, 256, 448, 512};
    constexpr int N = 1024;
    constexpr int K = 4096;

    for(int M : Ms)
        this->Run(M, N, K);
}
TYPED_TEST(TestGemmMultiplyMultiplyWP_FP8_MK_NK, Regular2)
{
    std::vector<int> Ms{128, 256, 512};
    constexpr int N = 448;
    constexpr int K = 2048;

    for(int M : Ms)
        this->Run(M, N, K);
}

View File

@@ -0,0 +1,6 @@
if(GPU_TARGETS MATCHES "gfx9[45]|gfx12")
    add_gtest_executable(test_gemm_universal_preshuffle_xdl_fp8 test_gemm_universal_preshuffle_xdl_fp8.cpp)
    if(result EQUAL 0)
        target_link_libraries(test_gemm_universal_preshuffle_xdl_fp8 PRIVATE utility device_gemm_universal_preshuffle_instance)
    endif()
endif()

View File

@@ -0,0 +1,79 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/ck.hpp"
#include "profiler/profile_gemm_universal_preshuffle_impl.hpp"
namespace ck {
namespace test {
using Row = ck::tensor_layout::gemm::RowMajor;
using F32 = float;
template <typename Tuple>
class TestGemmUniversalPreshuffleCommon : public ::testing::Test
{
    protected:
    using ALayout = std::tuple_element_t<0, Tuple>;
    using BLayout = std::tuple_element_t<1, Tuple>;
    using CLayout = Row;

    using ADataType       = std::tuple_element_t<2, Tuple>;
    using BDataType       = std::tuple_element_t<3, Tuple>;
    using ComputeDataType = std::tuple_element_t<4, Tuple>;
    using CDataType       = std::tuple_element_t<5, Tuple>;
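    // "Preshuffle" refers to the B (weight) operand being stored pre-rearranged into
    // the layout the kernel consumes directly, avoiding a run-time shuffle (inferred
    // from the instance and profiler naming).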
    public:
    static constexpr bool verify_     = true;
    static constexpr int init_method_ = 1;
    static constexpr bool log_        = false;
    static constexpr bool bench_      = false;

    std::vector<int> k_batches_;
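    // Split-K factors swept by Run(), as in the other fixtures in this change.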
    void SetUp() override { k_batches_ = {1, 2, 4}; }

    void Run(const int M, const int N, const int K)
    {
        for(size_t i = 0; i < k_batches_.size(); i++)
        {
            RunSingle(M, N, K, k_batches_[i]);
        }
    }
    void RunSingle(
        const int M, const int N, const int K, int kbatch = 1, int n_warmup = 1, int n_iter = 10)
    {
        bool all_success = true;

        int StrideA = std::is_same_v<ALayout, Row> ? K : M;
        int StrideB = std::is_same_v<BLayout, Row> ? N : K;
        int StrideC = std::is_same_v<CLayout, Row> ? N : M;

        all_success = all_success &
                      ck::profiler::profile_gemm_universal_preshuffle_impl<ADataType,
                                                                           BDataType,
                                                                           ComputeDataType,
                                                                           F32,
                                                                           CDataType,
                                                                           ALayout,
                                                                           BLayout,
                                                                           CLayout>(verify_,
                                                                                    init_method_,
                                                                                    log_,
                                                                                    bench_,
                                                                                    M,
                                                                                    N,
                                                                                    K,
                                                                                    StrideA,
                                                                                    StrideB,
                                                                                    StrideC,
                                                                                    kbatch,
                                                                                    n_warmup,
                                                                                    n_iter);

        EXPECT_TRUE(all_success);
    }
};
} // namespace test
} // namespace ck

View File

@@ -0,0 +1,77 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_common.hpp"
using F8 = ck::f8_t;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
namespace {

template <typename X, typename Y>
struct tuple_concat;

template <typename... Xs, typename... Ys>
struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
{
    using type = std::tuple<Xs..., Ys...>;
};

} // namespace
template <typename Tuple>
class TestGemmUniversalPreshuffle_FP8_MK_NK
    : public ck::test::TestGemmUniversalPreshuffleCommon<
          typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
{
};
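// Each KernelTypes tuple supplies, in order: the A, B, compute, and output C data
// types; the fixture prepends the Row/Col layouts for A and B.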
// clang-format off
using KernelTypes_MK_NK = ::testing::Types<
#if defined(CK_ENABLE_FP8)
    std::tuple< F8, F8, F8, F16>,
    std::tuple< F8, F8, F8, BF16>
#endif
    >;
// clang-format on
TYPED_TEST_SUITE(TestGemmUniversalPreshuffle_FP8_MK_NK, KernelTypes_MK_NK);
TYPED_TEST(TestGemmUniversalPreshuffle_FP8_MK_NK, Regular0)
{
    std::vector<int> Ms{128, 224, 256, 448, 512};
    constexpr int N = 512;
    constexpr int K = 2048;

    for(int M : Ms)
        this->Run(M, N, K);
}
TYPED_TEST(TestGemmUniversalPreshuffle_FP8_MK_NK, Regular1)
{
    std::vector<int> Ms{128, 224, 256, 448, 512};
    constexpr int N = 1024;
    constexpr int K = 4096;

    for(int M : Ms)
        this->Run(M, N, K);
}
TYPED_TEST(TestGemmUniversalPreshuffle_FP8_MK_NK, Regular2)
{
    std::vector<int> Ms{128, 256, 512};
    constexpr int N = 448;
    constexpr int K = 2048;

    for(int M : Ms)
        this->Run(M, N, K);
}