Implement device_gemm_universal_preshuffle_instance for RDNA4 (#3429)

* add device_gemm_wmma_cshuffle_v3_b_preshuffle.hpp

* add examples

* add instances to test

* remove duplicate code between examples

[ROCm/composable_kernel commit: 6df2d70143]
This commit is contained in:
Yung-sheng Tu
2026-01-15 16:19:31 +01:00
committed by GitHub
parent 09d084bfb4
commit 97f2fa2912
20 changed files with 1229 additions and 14 deletions

View File

@@ -3,18 +3,19 @@
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#ifdef CK_USE_XDL
#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/utility/amd_ck_fp8.hpp"
#include "ck/utility/data_type.hpp"
#if defined(CK_USE_XDL) || defined(CK_USE_WMMA)
#include "gemm_universal_preshuffle.inc"
#endif
#include <memory>
#include <vector>
namespace ck {
namespace tensor_operation {
namespace device {
@@ -51,7 +52,7 @@ struct DeviceOperationInstanceFactory<
static auto GetInstances()
{
#ifdef CK_USE_XDL
#if defined(CK_USE_XDL) || defined(CK_USE_WMMA)
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
#if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8))
if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
@@ -60,6 +61,7 @@ struct DeviceOperationInstanceFactory<
if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
is_same_v<CLayout, Row>)
{
#ifdef CK_USE_XDL
add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma32x32_mn_instances(
op_ptrs);
add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma32x32_mn_compute_instances(
@@ -90,6 +92,17 @@ struct DeviceOperationInstanceFactory<
op_ptrs);
add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma16x16_nk_mn_comp_default_instances_part1(
op_ptrs);
#endif
#ifdef CK_USE_WMMA
add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p1(
op_ptrs);
add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p2(
op_ptrs);
add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p3(
op_ptrs);
add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p4(
op_ptrs);
#endif
}
}
#endif
@@ -100,6 +113,7 @@ struct DeviceOperationInstanceFactory<
if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
is_same_v<CLayout, Row>)
{
#ifdef CK_USE_XDL
add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p1(
op_ptrs);
add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p2(
@@ -136,10 +150,21 @@ struct DeviceOperationInstanceFactory<
op_ptrs);
add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p6(
op_ptrs);
#endif
#ifdef CK_USE_WMMA
add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p1(
op_ptrs);
add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p2(
op_ptrs);
add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p3(
op_ptrs);
add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p4(
op_ptrs);
#endif
}
}
#endif
#endif // CK_USE_XDL
#endif // CK_USE_XDL || CK_USE_WMMA
return op_ptrs;
}

View File

@@ -13,8 +13,7 @@ namespace instance {
// Shorthand for the instance container passed to the F8/F8/BF16 preshuffle
// "add_..._instances" functions declared in this header.
// Note that the alias itself is an lvalue-reference type (trailing '&'): it
// names a reference to a std::vector of std::unique_ptr to
// DeviceGemmV2BPreshuffle<Row, Col, Row, F8, F8, BF16, PassThrough x3>.
// NOTE(review): the meaning of each template argument (layouts, data types,
// element-wise ops) is defined by DeviceGemmV2BPreshuffle, which is not
// visible here - confirm the order against its declaration.
using GemmF8F8BF16InstanceVector = std::vector<std::unique_ptr<
DeviceGemmV2BPreshuffle<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&;
using GemmF8F8F16InstanceVector = std::vector<std::unique_ptr<
DeviceGemmV2BPreshuffle<Row, Col, Row, F8, F8, F16, PassThrough, PassThrough, PassThrough>>>&;
#ifdef CK_USE_XDL
void add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma32x32_mn_instances(
GemmF8F8BF16InstanceVector& instances);
@@ -61,7 +60,32 @@ void add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma16x16_nk_mn_comp
GemmF8F8BF16InstanceVector& instances);
#endif
// Declarations of the WMMA variants of the F8/F8/BF16 preshuffle GEMM
// instance-registration functions. They are only visible when CK_USE_WMMA is
// defined, mirroring the CK_USE_XDL guard used for the XDL variants.
//
// The functions are split into four parts (_p1 .. _p4); the definitions are not
// part of this header. Presumably each one appends its share of device
// operation instances to `instances` - verify against the defining sources.
//
// `GemmF8F8BF16InstanceVector` is already an lvalue-reference alias, so the
// extra '&' in each parameter collapses and the parameter is a plain
// lvalue reference to the instance vector.
#ifdef CK_USE_WMMA
void add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p1(
GemmF8F8BF16InstanceVector& instances);
void add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p2(
GemmF8F8BF16InstanceVector& instances);
void add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p3(
GemmF8F8BF16InstanceVector& instances);
void add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p4(
GemmF8F8BF16InstanceVector& instances);
#endif
#endif
#if(defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8))
// Shorthand for the instance container passed to the F8/F8/F16 preshuffle
// "add_..._instances" functions declared below.
// Note that the alias itself is an lvalue-reference type (trailing '&'): it
// names a reference to a std::vector of std::unique_ptr to
// DeviceGemmV2BPreshuffle<Row, Col, Row, F8, F8, F16, PassThrough x3>.
// NOTE(review): the meaning of each template argument (layouts, data types,
// element-wise ops) is defined by DeviceGemmV2BPreshuffle, which is not
// visible here - confirm the order against its declaration.
using GemmF8F8F16InstanceVector = std::vector<std::unique_ptr<
DeviceGemmV2BPreshuffle<Row, Col, Row, F8, F8, F16, PassThrough, PassThrough, PassThrough>>>&;
#ifdef CK_USE_XDL
void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p1(
GemmF8F8F16InstanceVector& instances);
void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p2(
@@ -99,6 +123,25 @@ void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_
GemmF8F8F16InstanceVector& instances);
void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p6(
GemmF8F8F16InstanceVector& instances);
#endif
// Declarations of the WMMA variants of the F8/F8/F16 preshuffle GEMM
// instance-registration functions. They are only visible when CK_USE_WMMA is
// defined, mirroring the CK_USE_XDL guard used for the XDL variants.
//
// The functions are split into four parts (_p1 .. _p4); the definitions are not
// part of this header. Presumably each one appends its share of device
// operation instances to `instances` - verify against the defining sources.
//
// `GemmF8F8F16InstanceVector` is already an lvalue-reference alias, so the
// extra '&' in each parameter collapses and the parameter is a plain
// lvalue reference to the instance vector.
#ifdef CK_USE_WMMA
void add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p1(
GemmF8F8F16InstanceVector& instances);
void add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p2(
GemmF8F8F16InstanceVector& instances);
void add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p3(
GemmF8F8F16InstanceVector& instances);
void add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p4(
GemmF8F8F16InstanceVector& instances);
#endif
#endif
} // namespace instance
} // namespace device