Implement batched gemm add relu gemm add for rdna4 (#3391)

* wip: test suite for batched gemm multiple d gemm multiple d, working on gridwise implenentation

* wip: many fixes in implementation of batched gemm gemm multiple d

* wip: batched gemm gemm multiple d gridwise op compiling, not working yet

* fix: incorrect d0 grid indexing in batched gemm gemm multipled

* feat: add instances for batched gemm add relu gemm add

* chore: configure instance with low vector transfer size for odd sizes

* chore: add some more validation to device batched gemm gemm multiple d, and removed template parameter that didn't really make sense

* fix: upate device_batched_gemm_gemm_wmma to work with new gridwise changes

* fix: disable odd size tests on XDL archs

* chore: removed temporary logging

* chore: update some references to C tensor to E tensor

* Tentative fix for example template params

* Tentative fix for non-multi-D batched gemm gemm device impl.

* Tentative fix for xdl example template params

* Tentative fix for profiler build on gfx90a

* chore: improve device batched gemm gemm multi D comment to include all ops and dimensions

* chore: explicitly call ck::make_tuple to prevent issues when std::make_tuple would apply

* fix: make the gemm1 data types match what happens in the device op

* feat: add d0s/d1s datatypes and layouts to the device op type string

* chore: change element-wise op so addition happens in fp32

* chore: add static asserts for gemm0/gemm1 calculated wave sizes

* chore: also updated other element-wise ops to use fp32 calculations

* chore: log number of supported instances

* chore: update instance comment

* chore: disable kernel timing in example by default

* fix: gemm1 wave size calculation

* fix: make sure batched gemm multiple d gemm multiple d profiler performs correct type conversions

* chore: remove increased tolerance in batched gemm gemm multiple d example

* chore: add comment explaining that verification fails for certain input values

* chore: clarify instance comment

---------

Co-authored-by: kiefer <kiefer.van.teutem@streamhpc.com>

[ROCm/composable_kernel commit: d5ae81b292]
This commit is contained in:
Erwin Terpstra
2026-01-20 22:06:59 +01:00
committed by GitHub
parent 8b842250da
commit b079841b10
22 changed files with 2956 additions and 499 deletions

View File

@@ -20,6 +20,52 @@ namespace tensor_operation {
namespace device {
namespace instance {
#ifdef CK_USE_WMMA
#ifdef CK_ENABLE_FP16
void add_device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
std::vector<std::unique_ptr<DeviceBatchedGemmMultipleDGemmMultipleD<Row,
Col,
ck::Tuple<Row>,
Row,
ck::Tuple<Row>,
Row,
F16,
F16,
ck::Tuple<F16>,
F16,
ck::Tuple<F16>,
F16,
PassThrough,
PassThrough,
CDE0ElementOp,
PassThrough,
CDE1ElementOp>>>&
instances);
void add_device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance(
std::vector<std::unique_ptr<DeviceBatchedGemmMultipleDGemmMultipleD<Row,
Col,
ck::Tuple<Row>,
Col,
ck::Tuple<Row>,
Row,
F16,
F16,
ck::Tuple<F16>,
F16,
ck::Tuple<F16>,
F16,
PassThrough,
PassThrough,
CDE0ElementOp,
PassThrough,
CDE1ElementOp>>>&
instances);
#endif // CK_ENABLE_FP16
#endif // CK_USE_WMMA
#ifdef CK_USE_XDL
#ifdef CK_ENABLE_FP16
void add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
std::vector<std::unique_ptr<DeviceBatchedGemmMultipleDGemmMultipleD<Row,
Col,
@@ -59,7 +105,8 @@ void add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_
PassThrough,
CDE1ElementOp>>>&
instances);
#endif // CK_ENABLE_FP16
#endif // CK_USE_XDL
template <typename A0Layout,
typename B0Layout,
typename D0sLayout,
@@ -113,22 +160,36 @@ struct DeviceOperationInstanceFactory<
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
#ifdef CK_ENABLE_FP16
if constexpr(is_same_v<A0DataType, half_t> && is_same_v<B0DataType, half_t> &&
is_same_v<B1DataType, half_t> && is_same_v<E1DataType, half_t>)
{
if constexpr(is_same_v<A0Layout, Row> && is_same_v<B0Layout, Col> &&
is_same_v<B1Layout, Row> && is_same_v<E1Layout, Row>)
{
#ifdef CK_USE_XDL
add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
op_ptrs);
#endif
#ifdef CK_USE_WMMA
add_device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
op_ptrs);
#endif
}
else if constexpr(is_same_v<A0Layout, Row> && is_same_v<B0Layout, Col> &&
is_same_v<B1Layout, Col> && is_same_v<E1Layout, Row>)
{
#ifdef CK_USE_XDL
add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance(
op_ptrs);
#endif
#ifdef CK_USE_WMMA
add_device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance(
op_ptrs);
#endif
}
}
#endif
return op_ptrs;
}
};