Merge commit 'ce99cab6056d1ffef5acb6f4ad7ede87a46a3cfc' into develop

This commit is contained in:
assistant-librarian[bot]
2025-12-11 08:17:07 +00:00
parent a1037bfc3c
commit b69f9eb589
84 changed files with 6712 additions and 1594 deletions

View File

@@ -261,6 +261,7 @@ add_subdirectory(gemm_multiply_multiply_wp)
add_subdirectory(gemm_split_k)
add_subdirectory(gemm_universal)
add_subdirectory(gemm_universal_preshuffle)
add_subdirectory(gemm_ab_scale)
add_subdirectory(gemm_b_scale)
add_subdirectory(gemm_universal_streamk)
add_subdirectory(gemm_reduce)

View File

@@ -0,0 +1,9 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
if(GPU_TARGETS MATCHES "gfx9[45]|gfx12")
add_gtest_executable(test_gemm_ab_scale test_gemm_ab_scale.cpp)
if(result EQUAL 0)
target_link_libraries(test_gemm_ab_scale PRIVATE utility device_gemm_ab_scale_instance)
endif()
endif()

View File

@@ -0,0 +1,236 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <tuple>
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_ab_scale_util.hpp"
using BF16 = ck::bhalf_t;
using F32 = float;
using F8 = ck::f8_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
namespace {
template <typename X, typename Y>
struct tuple_concat;
template <typename... Xs, typename... Ys>
struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
{
using type = std::tuple<Xs..., Ys...>;
};
} // namespace
template <typename Tuple>
class TestGemmABScale_MK_NK : public ck::test::TestGemmABScale<
typename tuple_concat<std::tuple<Row, Col, Row>, Tuple>::type>
{
};
template <typename Tuple>
class TestGemmABScale_MK_KN : public ck::test::TestGemmABScale<
typename tuple_concat<std::tuple<Row, Row, Row>, Tuple>::type>
{
};
template <typename Tuple>
class TestGemmABScale_KM_KN : public ck::test::TestGemmABScale<
typename tuple_concat<std::tuple<Col, Row, Row>, Tuple>::type>
{
};
// clang-format off
using KernelTypes = ::testing::Types<
// ADataType, BDataType, ComputeDataType, EDataType
std::tuple< F8, F32, F8, F32, F8, BF16>
>;
// clang-format on
TYPED_TEST_SUITE(TestGemmABScale_MK_NK, KernelTypes);
TYPED_TEST_SUITE(TestGemmABScale_MK_KN, KernelTypes);
TYPED_TEST_SUITE(TestGemmABScale_KM_KN, KernelTypes);
// Row Col
TYPED_TEST(TestGemmABScale_MK_NK, SmallM)
{
std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512;
constexpr int K = 1024;
constexpr int StrideA = K;
constexpr int StrideB = K;
constexpr int StrideC = N;
for(int M : Ms)
this->Run(M, N, K, StrideA, StrideB, StrideC);
}
TYPED_TEST(TestGemmABScale_MK_NK, SmallMPadK)
{
std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512;
constexpr int K = 704;
constexpr int StrideA = K;
constexpr int StrideB = K;
constexpr int StrideC = N;
for(int M : Ms)
this->Run(M, N, K, StrideA, StrideB, StrideC);
}
TYPED_TEST(TestGemmABScale_MK_NK, MidLargeM)
{
std::vector<int> Ms{127, 255, 312, 799, 1573};
constexpr int N = 512;
constexpr int K = 1024;
constexpr int StrideA = K;
constexpr int StrideB = K;
constexpr int StrideC = N;
for(int M : Ms)
this->Run(M, N, K, StrideA, StrideB, StrideC);
}
TYPED_TEST(TestGemmABScale_MK_NK, Regular)
{
std::vector<int> Ms{512};
constexpr int N = 512;
constexpr int K = 1024;
constexpr int StrideA = K;
constexpr int StrideB = K;
constexpr int StrideE = N;
for(int M : Ms)
this->Run(M, N, K, StrideA, StrideB, StrideE);
}
// Row Row
TYPED_TEST(TestGemmABScale_MK_KN, SmallM)
{
std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512;
constexpr int K = 1024;
constexpr int StrideA = K;
constexpr int StrideB = N;
constexpr int StrideC = N;
for(int M : Ms)
this->Run(M, N, K, StrideA, StrideB, StrideC);
}
TYPED_TEST(TestGemmABScale_MK_KN, SmallMPadK)
{
std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512;
constexpr int K = 704;
constexpr int StrideA = K;
constexpr int StrideB = N;
constexpr int StrideC = N;
for(int M : Ms)
this->Run(M, N, K, StrideA, StrideB, StrideC);
}
TYPED_TEST(TestGemmABScale_MK_KN, MidLargeM)
{
std::vector<int> Ms{127, 255, 312, 799, 1573};
constexpr int N = 512;
constexpr int K = 1024;
constexpr int StrideA = K;
constexpr int StrideB = N;
constexpr int StrideC = N;
for(int M : Ms)
this->Run(M, N, K, StrideA, StrideB, StrideC);
}
TYPED_TEST(TestGemmABScale_MK_KN, Regular)
{
std::vector<int> Ms{512};
constexpr int N = 512;
constexpr int K = 1024;
constexpr int StrideA = K;
constexpr int StrideB = N;
constexpr int StrideE = N;
for(int M : Ms)
this->Run(M, N, K, StrideA, StrideB, StrideE);
}
// Col Row
TYPED_TEST(TestGemmABScale_KM_KN, SmallM)
{
std::vector<int> Ms{16, 32};
constexpr int N = 512;
constexpr int K = 1024;
constexpr int StrideB = N;
constexpr int StrideC = N;
for(int M : Ms)
{
int StrideA = M;
this->Run(M, N, K, StrideA, StrideB, StrideC);
}
}
TYPED_TEST(TestGemmABScale_KM_KN, SmallMPadK)
{
std::vector<int> Ms{16, 32};
constexpr int N = 512;
constexpr int K = 704;
constexpr int StrideB = N;
constexpr int StrideC = N;
for(int M : Ms)
{
int StrideA = M;
this->Run(M, N, K, StrideA, StrideB, StrideC);
}
}
TYPED_TEST(TestGemmABScale_KM_KN, MidLargeM)
{
std::vector<int> Ms{128, 256};
constexpr int N = 512;
constexpr int K = 1024;
constexpr int StrideB = N;
constexpr int StrideC = N;
for(int M : Ms)
{
int StrideA = M;
this->Run(M, N, K, StrideA, StrideB, StrideC);
}
}
TYPED_TEST(TestGemmABScale_KM_KN, Regular)
{
std::vector<int> Ms{512};
constexpr int N = 512;
constexpr int K = 1024;
constexpr int StrideB = N;
constexpr int StrideE = N;
for(int M : Ms)
{
int StrideA = M;
this->Run(M, N, K, StrideA, StrideB, StrideE);
}
}

View File

@@ -0,0 +1,102 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <string>
#include <sstream>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/data_type.hpp"
#include "profiler/profile_gemm_ab_scale_impl.hpp"
namespace ck {
namespace test {
template <typename Tuple>
class TestGemmABScale : public testing::Test
{
using F32 = float;
protected:
using ALayout = std::tuple_element_t<0, Tuple>;
using BLayout = std::tuple_element_t<1, Tuple>;
using ELayout = std::tuple_element_t<2, Tuple>;
using A0DataType = std::tuple_element_t<3, Tuple>;
using A1DataType = std::tuple_element_t<4, Tuple>;
using B0DataType = std::tuple_element_t<5, Tuple>;
using B1DataType = std::tuple_element_t<6, Tuple>;
using ComputeDataType = std::tuple_element_t<7, Tuple>;
using EDataType = std::tuple_element_t<8, Tuple>;
public:
static constexpr ck::index_t ScaleBlockM = 1;
static constexpr ck::index_t ScaleBlockN = 128;
static constexpr ck::index_t ScaleBlockK = 128;
static constexpr bool verify_ = true;
static constexpr int init_method_ = 1;
static constexpr bool log_ = false;
static constexpr bool bench_ = false;
std::vector<int> k_batches_;
void SetUp() override { k_batches_ = {1, 2}; }
void Run(const int M,
const int N,
const int K,
const int StrideA,
const int StrideB,
const int StrideE)
{
for(auto kb : k_batches_)
{
RunSingle(M, N, K, StrideA, StrideB, StrideE, kb);
}
}
void RunSingle(const int M,
const int N,
const int K,
const int StrideA,
const int StrideB,
const int StrideE,
int kbatch = 1,
int n_warmup = 1,
int n_iter = 10)
{
bool pass = ck::profiler::profile_gemm_ab_scale_impl<A0DataType,
A1DataType,
B0DataType,
B1DataType,
ComputeDataType,
F32,
EDataType,
ScaleBlockM,
ScaleBlockN,
ScaleBlockK,
ALayout,
BLayout,
ELayout>(verify_,
init_method_,
log_,
bench_,
M,
N,
K,
StrideA,
StrideB,
StrideE,
kbatch,
n_warmup,
n_iter);
EXPECT_TRUE(pass);
}
};
} // namespace test
} // namespace ck

View File

@@ -2,8 +2,8 @@
# SPDX-License-Identifier: MIT
if(GPU_TARGETS MATCHES "gfx9[45]|gfx12")
add_gtest_executable(test_gemm_blockscale_wp_xdl_fp8 test_gemm_blockscale_wp_xdl_fp8.cpp)
add_gtest_executable(test_gemm_blockscale_wp_fp8 test_gemm_blockscale_wp_fp8.cpp)
if(result EQUAL 0)
target_link_libraries(test_gemm_blockscale_wp_xdl_fp8 PRIVATE utility device_gemm_blockscale_wp_instance)
target_link_libraries(test_gemm_blockscale_wp_fp8 PRIVATE utility device_gemm_blockscale_wp_instance)
endif()
endif()