Files
composable_kernel/test/mx_wmma_op/mx_wmma_op_unscaled.cpp
Illia Silin 717f2efef7 [rocm-libraries] ROCm/rocm-libraries#6978 (commit e58096d)
[CK] add composable kernel support on gfx1250 (#6978)

## Motivation

Add composable kernel support on gfx1250.

## Technical Details

<!-- Explain the changes along with any relevant GitHub links. -->

## Test Plan

<!-- Explain any relevant testing done to verify this PR. -->

## Test Result

<!-- Briefly summarize test outcomes. -->

## Submission Checklist

- [ ] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.

---------

Co-authored-by: Qun Lin <qlin@amd.com>
Co-authored-by: jialuo12_amdeng <jia.luo@amd.com>
Co-authored-by: Andriy Roshchenko <andriy.roshchenko@amd.com>
Co-authored-by: hsivasun_amdeng <haresh.sivasuntharampillai@amd.com>
2026-05-15 06:46:51 -07:00

341 lines
12 KiB
C++

// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "mx_wmma_op_test_common.hpp"
// Unscaled WMMA: test wmma_f16_16x16x128_bf8_bf8_gfx125
TEST(MXWMMA_UNSCALED, MXF16WMMA16x16x128_BF8_BF8_GFX125)
{
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AType = ck::bf8_t;
using BType = ck::bf8_t;
using CType = ck::half_t;
constexpr int BLOCK_M = 16;
constexpr int BLOCK_N = 16;
constexpr int BLOCK_K = 128;
using AccType = ck::half_t;
auto pass = run_mx_wmma_unscaled_test<ALayout,
BLayout,
CLayout,
AType,
BType,
CType,
AccType,
BLOCK_M,
BLOCK_N,
BLOCK_K>(common_init);
EXPECT_TRUE(pass);
}
// Unscaled WMMA: test wmma_f16_16x16x128_bf8_fp8_gfx125
TEST(MXWMMA_UNSCALED, MXF16WMMA16x16x128_BF8_FP8_GFX125)
{
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AType = ck::bf8_t;
using BType = ck::f8_t;
using CType = ck::half_t;
constexpr int BLOCK_M = 16;
constexpr int BLOCK_N = 16;
constexpr int BLOCK_K = 128;
using AccType = ck::half_t;
auto pass = run_mx_wmma_unscaled_test<ALayout,
BLayout,
CLayout,
AType,
BType,
CType,
AccType,
BLOCK_M,
BLOCK_N,
BLOCK_K>(common_init);
EXPECT_TRUE(pass);
}
// Unscaled WMMA: test wmma_f16_16x16x128_fp8_bf8_gfx125
TEST(MXWMMA_UNSCALED, MXF16WMMA16x16x128_FP8_BF8_GFX125)
{
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AType = ck::f8_t;
using BType = ck::bf8_t;
using CType = ck::half_t;
constexpr int BLOCK_M = 16;
constexpr int BLOCK_N = 16;
constexpr int BLOCK_K = 128;
using AccType = ck::half_t;
auto pass = run_mx_wmma_unscaled_test<ALayout,
BLayout,
CLayout,
AType,
BType,
CType,
AccType,
BLOCK_M,
BLOCK_N,
BLOCK_K>(common_init);
EXPECT_TRUE(pass);
}
// Unscaled WMMA: test wmma_f16_16x16x128_fp8_fp8_gfx125
TEST(MXWMMA_UNSCALED, MXF16WMMA16x16x128_FP8_FP8_GFX125)
{
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AType = ck::f8_t;
using BType = ck::f8_t;
using CType = ck::half_t;
constexpr int BLOCK_M = 16;
constexpr int BLOCK_N = 16;
constexpr int BLOCK_K = 128;
using AccType = ck::half_t;
auto pass = run_mx_wmma_unscaled_test<ALayout,
BLayout,
CLayout,
AType,
BType,
CType,
AccType,
BLOCK_M,
BLOCK_N,
BLOCK_K>(common_init);
EXPECT_TRUE(pass);
}
// Unscaled WMMA: test wmma_f16_16x16x64_f8f8_gfx125
TEST(MXWMMA_UNSCALED, MXF16WMMA16x16x64_FP8_FP8_GFX125)
{
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AType = ck::f8_t;
using BType = ck::f8_t;
using CType = ck::half_t;
constexpr int BLOCK_M = 16;
constexpr int BLOCK_N = 16;
constexpr int BLOCK_K = 64;
using AccType = ck::half_t;
auto pass = run_mx_wmma_unscaled_test<ALayout,
BLayout,
CLayout,
AType,
BType,
CType,
AccType,
BLOCK_M,
BLOCK_N,
BLOCK_K>(common_init);
EXPECT_TRUE(pass);
}
// Unscaled WMMA: test wmma_f16_16x16x64_f8bf8_gfx125
TEST(MXWMMA_UNSCALED, MXF16WMMA16x16x64_FP8_BF8_GFX125)
{
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AType = ck::f8_t;
using BType = ck::bf8_t;
using CType = ck::half_t;
constexpr int BLOCK_M = 16;
constexpr int BLOCK_N = 16;
constexpr int BLOCK_K = 64;
using AccType = ck::half_t;
auto pass = run_mx_wmma_unscaled_test<ALayout,
BLayout,
CLayout,
AType,
BType,
CType,
AccType,
BLOCK_M,
BLOCK_N,
BLOCK_K>(common_init);
EXPECT_TRUE(pass);
}
// Unscaled WMMA: test wmma_f16_16x16x64_bf8f8_gfx125
TEST(MXWMMA_UNSCALED, MXF16WMMA16x16x64_BF8_FP8_GFX125)
{
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AType = ck::bf8_t;
using BType = ck::f8_t;
using CType = ck::half_t;
constexpr int BLOCK_M = 16;
constexpr int BLOCK_N = 16;
constexpr int BLOCK_K = 64;
using AccType = ck::half_t;
auto pass = run_mx_wmma_unscaled_test<ALayout,
BLayout,
CLayout,
AType,
BType,
CType,
AccType,
BLOCK_M,
BLOCK_N,
BLOCK_K>(common_init);
EXPECT_TRUE(pass);
}
// Unscaled WMMA: test wmma_f16_16x16x64_bf8bf8_gfx125
TEST(MXWMMA_UNSCALED, MXF16WMMA16x16x64_BF8_BF8_GFX125)
{
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AType = ck::bf8_t;
using BType = ck::bf8_t;
using CType = ck::half_t;
constexpr int BLOCK_M = 16;
constexpr int BLOCK_N = 16;
constexpr int BLOCK_K = 64;
using AccType = ck::half_t;
auto pass = run_mx_wmma_unscaled_test<ALayout,
BLayout,
CLayout,
AType,
BType,
CType,
AccType,
BLOCK_M,
BLOCK_N,
BLOCK_K>(common_init);
EXPECT_TRUE(pass);
}
// Unscaled WMMA: test wmma_f32_16x16x128_bf8_bf8_gfx125
TEST(MXWMMA_UNSCALED, MXF32WMMA16x16x128_BF8_BF8_GFX125)
{
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AType = ck::bf8_t;
using BType = ck::bf8_t;
using CType = float;
constexpr int BLOCK_M = 16;
constexpr int BLOCK_N = 16;
constexpr int BLOCK_K = 128;
using AccType = float;
auto pass = run_mx_wmma_unscaled_test<ALayout,
BLayout,
CLayout,
AType,
BType,
CType,
AccType,
BLOCK_M,
BLOCK_N,
BLOCK_K>(common_init);
EXPECT_TRUE(pass);
}
// Unscaled WMMA: test wmma_f32_16x16x128_bf8_fp8_gfx125
TEST(MXWMMA_UNSCALED, MXF32WMMA16x16x128_BF8_FP8_GFX125)
{
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AType = ck::bf8_t;
using BType = ck::f8_t;
using CType = float;
constexpr int BLOCK_M = 16;
constexpr int BLOCK_N = 16;
constexpr int BLOCK_K = 128;
using AccType = float;
auto pass = run_mx_wmma_unscaled_test<ALayout,
BLayout,
CLayout,
AType,
BType,
CType,
AccType,
BLOCK_M,
BLOCK_N,
BLOCK_K>(common_init);
EXPECT_TRUE(pass);
}
// Unscaled WMMA: test wmma_f32_16x16x128_fp8_bf8_gfx125
TEST(MXWMMA_UNSCALED, MXF32WMMA16x16x128_FP8_BF8_GFX125)
{
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AType = ck::f8_t;
using BType = ck::bf8_t;
using CType = float;
constexpr int BLOCK_M = 16;
constexpr int BLOCK_N = 16;
constexpr int BLOCK_K = 128;
using AccType = float;
auto pass = run_mx_wmma_unscaled_test<ALayout,
BLayout,
CLayout,
AType,
BType,
CType,
AccType,
BLOCK_M,
BLOCK_N,
BLOCK_K>(common_init);
EXPECT_TRUE(pass);
}
// Unscaled WMMA: test wmma_f32_16x16x128_fp8_fp8_gfx125
TEST(MXWMMA_UNSCALED, MXF32WMMA16x16x128_FP8_FP8_GFX125)
{
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AType = ck::f8_t;
using BType = ck::f8_t;
using CType = float;
constexpr int BLOCK_M = 16;
constexpr int BLOCK_N = 16;
constexpr int BLOCK_K = 128;
using AccType = float;
auto pass = run_mx_wmma_unscaled_test<ALayout,
BLayout,
CLayout,
AType,
BType,
CType,
AccType,
BLOCK_M,
BLOCK_N,
BLOCK_K>(common_init);
EXPECT_TRUE(pass);
}