mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-22 16:04:32 +00:00
[CK] add composable kernel support on gfx1250 (#6978) ## Motivation Add composable kernel support on gfx1250. ## Technical Details <!-- Explain the changes along with any relevant GitHub links. --> ## Test Plan <!-- Explain any relevant testing done to verify this PR. --> ## Test Result <!-- Briefly summarize test outcomes. --> ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. --------- Co-authored-by: Qun Lin <qlin@amd.com> Co-authored-by: jialuo12_amdeng <jia.luo@amd.com> Co-authored-by: Andriy Roshchenko <andriy.roshchenko@amd.com> Co-authored-by: hsivasun_amdeng <haresh.sivasuntharampillai@amd.com>
195 lines
7.1 KiB
C++
195 lines
7.1 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#include "mx_wmma_op_test_common.hpp"
|
|
|
|
// test FP4@FP4 with e8m0 scale and 32 block size
|
|
TEST(MXWMMA, MXFP4WMMA16x16x128_E8M0)
|
|
{
|
|
using ALayout = ck::tensor_layout::gemm::RowMajor;
|
|
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
|
|
using CLayout = ck::tensor_layout::gemm::RowMajor;
|
|
|
|
auto pass = run_mx_wmma_test<ALayout,
|
|
BLayout,
|
|
CLayout,
|
|
f4_t,
|
|
f4_t,
|
|
float,
|
|
e8m0_bexp_t,
|
|
e8m0_bexp_t,
|
|
ck::WMMA_SCALE::SCALE_F32_16x16x128>(common_init);
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
// test FP4@FP4 with e8m0 scale and 16 block size
|
|
TEST(MXWMMA, MXFP4WMMA16x16x128_SCALE16_E8M0)
|
|
{
|
|
using ALayout = ck::tensor_layout::gemm::RowMajor;
|
|
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
|
|
using CLayout = ck::tensor_layout::gemm::RowMajor;
|
|
|
|
auto pass = run_mx_wmma_test<ALayout,
|
|
BLayout,
|
|
CLayout,
|
|
f4_t,
|
|
f4_t,
|
|
float,
|
|
e8m0_bexp_t,
|
|
e8m0_bexp_t,
|
|
ck::WMMA_SCALE::SCALE16_F32_16x16x128>(common_init);
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
// test FP4@FP4 with e4m3 scale and 32 block size
|
|
TEST(MXWMMA, MXFP4WMMA16x16x128_E4M3)
|
|
{
|
|
using ALayout = ck::tensor_layout::gemm::RowMajor;
|
|
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
|
|
using CLayout = ck::tensor_layout::gemm::RowMajor;
|
|
|
|
auto pass = run_mx_wmma_test<ALayout,
|
|
BLayout,
|
|
CLayout,
|
|
f4_t,
|
|
f4_t,
|
|
float,
|
|
e4m3_scale_t,
|
|
e4m3_scale_t,
|
|
ck::WMMA_SCALE::SCALE_F32_16x16x128>(common_init);
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
// test FP4@FP4 with e4m3 scale and 16 block size
|
|
TEST(MXWMMA, MXFP4WMMA16x16x128_SCALE16_E4M3)
|
|
{
|
|
using ALayout = ck::tensor_layout::gemm::RowMajor;
|
|
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
|
|
using CLayout = ck::tensor_layout::gemm::RowMajor;
|
|
|
|
auto pass = run_mx_wmma_test<ALayout,
|
|
BLayout,
|
|
CLayout,
|
|
f4_t,
|
|
f4_t,
|
|
float,
|
|
e4m3_scale_t,
|
|
e4m3_scale_t,
|
|
ck::WMMA_SCALE::SCALE16_F32_16x16x128>(common_init);
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
// test FP4@FP4 with e5m3 scale and 32 block size
|
|
TEST(MXWMMA, MXFP4WMMA16x16x128_E5M3)
|
|
{
|
|
using ALayout = ck::tensor_layout::gemm::RowMajor;
|
|
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
|
|
using CLayout = ck::tensor_layout::gemm::RowMajor;
|
|
|
|
auto pass = run_mx_wmma_test<ALayout,
|
|
BLayout,
|
|
CLayout,
|
|
f4_t,
|
|
f4_t,
|
|
float,
|
|
e5m3_scale_t,
|
|
e5m3_scale_t,
|
|
ck::WMMA_SCALE::SCALE_F32_16x16x128>(common_init);
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
// test FP4@FP4 with e5m3 scale and 16 block size
|
|
TEST(MXWMMA, MXFP4WMMA16x16x128_SCALE16_E5M3)
|
|
{
|
|
using ALayout = ck::tensor_layout::gemm::RowMajor;
|
|
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
|
|
using CLayout = ck::tensor_layout::gemm::RowMajor;
|
|
|
|
auto pass = run_mx_wmma_test<ALayout,
|
|
BLayout,
|
|
CLayout,
|
|
f4_t,
|
|
f4_t,
|
|
float,
|
|
e5m3_scale_t,
|
|
e5m3_scale_t,
|
|
ck::WMMA_SCALE::SCALE16_F32_16x16x128>(common_init);
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
// test FP4@FP4 with e4m3 and e5m3 scales and 32 block size
|
|
TEST(MXWMMA, MXFP4WMMA16x16x128_E4M3_E5M3)
|
|
{
|
|
using ALayout = ck::tensor_layout::gemm::RowMajor;
|
|
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
|
|
using CLayout = ck::tensor_layout::gemm::RowMajor;
|
|
|
|
auto pass = run_mx_wmma_test<ALayout,
|
|
BLayout,
|
|
CLayout,
|
|
f4_t,
|
|
f4_t,
|
|
float,
|
|
e4m3_scale_t,
|
|
e5m3_scale_t,
|
|
ck::WMMA_SCALE::SCALE_F32_16x16x128>(common_init);
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
// test FP4@FP4 with e4m3 and e5m3 scales and 16 block size
|
|
TEST(MXWMMA, MXFP4WMMA16x16x128_SCALE16_E4M3_E5M3)
|
|
{
|
|
using ALayout = ck::tensor_layout::gemm::RowMajor;
|
|
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
|
|
using CLayout = ck::tensor_layout::gemm::RowMajor;
|
|
|
|
auto pass = run_mx_wmma_test<ALayout,
|
|
BLayout,
|
|
CLayout,
|
|
f4_t,
|
|
f4_t,
|
|
float,
|
|
e4m3_scale_t,
|
|
e5m3_scale_t,
|
|
ck::WMMA_SCALE::SCALE16_F32_16x16x128>(common_init);
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
// test FP4@FP4 with e5m3 and e4m3 scales and 32 block size
|
|
TEST(MXWMMA, MXFP4WMMA16x16x128_E5M3_E4M3)
|
|
{
|
|
using ALayout = ck::tensor_layout::gemm::RowMajor;
|
|
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
|
|
using CLayout = ck::tensor_layout::gemm::RowMajor;
|
|
|
|
auto pass = run_mx_wmma_test<ALayout,
|
|
BLayout,
|
|
CLayout,
|
|
f4_t,
|
|
f4_t,
|
|
float,
|
|
e5m3_scale_t,
|
|
e4m3_scale_t,
|
|
ck::WMMA_SCALE::SCALE_F32_16x16x128>(common_init);
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
|
|
// test FP4@FP4 with e5m3 and e4m3 scales and 16 block size
|
|
TEST(MXWMMA, MXFP4WMMA16x16x128_SCALE16_E5M3_E4M3)
|
|
{
|
|
using ALayout = ck::tensor_layout::gemm::RowMajor;
|
|
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
|
|
using CLayout = ck::tensor_layout::gemm::RowMajor;
|
|
|
|
auto pass = run_mx_wmma_test<ALayout,
|
|
BLayout,
|
|
CLayout,
|
|
f4_t,
|
|
f4_t,
|
|
float,
|
|
e5m3_scale_t,
|
|
e4m3_scale_t,
|
|
ck::WMMA_SCALE::SCALE16_F32_16x16x128>(common_init);
|
|
EXPECT_TRUE(pass);
|
|
}
|