[CK][Examples] Extending support for rdna3/4 part 3:

-example_gemm_xdl_int8
-example_gemm_xdl_fp8
-example_gemm_xdl_fp8_bf8
-example_gemm_xdl_fp16_fp8
-example_gemm_add_add_fastgelu_xdl_int8
-example_grouped_gemm_xdl_int8
-example_grouped_conv_bwd_weight_xdl_bf16
-example_cgemm_xdl_fp32
-example_cgemm_xdl_int8

fixing cmdlines for:
-example_22_cgemm
-example_24_batched_gemm
-example_batched_gemm_xdl_fp16int4_b_scale_v3

Signed-off-by: Michal Kulikowski <Michal.Kulikowski@amd.com>
This commit is contained in:
Michal Kulikowski
2025-10-03 15:30:40 +02:00
committed by Michał Kulikowski
parent 7259b9c4db
commit 2444c44895
13 changed files with 138 additions and 94 deletions

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
@@ -48,10 +48,10 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
16, // index_t KPerBlock
4, // index_t AK1
4, // index_t BK1
32, // index_t MPerXDL
32, // index_t NPerXDL
4, // index_t MXdlPerWave
2, // index_t NXdlPerWave
16, // index_t MPerXDL
16, // index_t NPerXDL
8, // index_t MXdlPerWave
4, // index_t NXdlPerWave
S<4, 64, 1>, // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder
@@ -69,11 +69,16 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
1, // index_t CShuffleMXdlPerWavePerShuffle
1, // index_t CShuffleNXdlPerWavePerShuffle
S<1, 16, 1, 16>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
4>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
2>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
// clang-format on
int main(int argc, char* argv[])
{
if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
{
return 0;
}
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
@@ -87,25 +92,25 @@ int main(int argc, char* argv[])
ck::index_t StrideB = 4096;
ck::index_t StrideC = 4096;
if(argc == 4)
if(argc == 1)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
// use default case
}
else if(argc == 10)
else if(argc == 4 || argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
if(argc == 10)
{
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideC = std::stoi(argv[9]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideC = std::stoi(argv[9]);
}
}
else
{
@@ -114,7 +119,7 @@ int main(int argc, char* argv[])
<< "arg3: run kernel # of times (>1)\n"
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"
<< std::endl;
exit(0);
exit(1);
}
return !run_cgemm_xdl<ADataType,

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
@@ -48,10 +48,10 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
64, // index_t KPerBlock
16, // index_t AK1
16, // index_t BK1
32, // index_t MPerXDL
32, // index_t NPerXDL
4, // index_t MXdlPerWave
2, // index_t NXdlPerWave
16, // index_t MPerXDL
16, // index_t NPerXDL
8, // index_t MXdlPerWave
4, // index_t NXdlPerWave
S<4, 64, 1>, // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder
@@ -68,8 +68,8 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
1, // index_t BBlockLdsExtraN
1, // index_t CShuffleMXdlPerWavePerShuffle
1, // index_t CShuffleNXdlPerWavePerShuffle
S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
S<1, 32, 1, 8>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
4>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
// clang-format on
int main(int argc, char* argv[])
@@ -87,25 +87,25 @@ int main(int argc, char* argv[])
ck::index_t StrideB = 4096;
ck::index_t StrideC = 4096;
if(argc == 4)
if(argc == 1)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
// use default case
}
else if(argc == 10)
else if(argc == 4 || argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
if(argc == 10)
{
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideC = std::stoi(argv[9]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideC = std::stoi(argv[9]);
}
}
else
{
@@ -114,7 +114,7 @@ int main(int argc, char* argv[])
<< "arg3: run kernel # of times (>1)\n"
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"
<< std::endl;
exit(0);
exit(1);
}
return !run_cgemm_xdl<ADataType,