[CK_BUILDER] Instance traits for conv bwd weight algorithms (#3498)

Added instance traits for the following bwd weight conv algorithms

DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
DeviceGroupedConvBwdWeight_Wmma_CShuffleV3
DeviceGroupedConvBwdWeight_Wmma_CShuffle
DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle
DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3
DeviceGroupedConvBwdWeight_Dl
DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3
Also added unit tests for the instance traits of those bwd weight algorithms that are currently exposed by the narrow CK build for MIOpen.
---------

Co-authored-by: Ville Pietilä <>
This commit is contained in:
Ville Pietilä
2025-12-31 15:41:15 -08:00
committed by GitHub
parent f3e4d46faa
commit 6e8c401e33
25 changed files with 3206 additions and 2 deletions

View File

@@ -0,0 +1,272 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "instance_traits.hpp"
#include "instance_traits_util.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
namespace ck::tensor_operation::device {
// Forward declaration of the device operation template. It is declared (not
// defined) here so that the InstanceTraits specialization later in this header
// can name the type.
// NOTE(review): the kinds and order of these template parameters must match the
// real definition of DeviceGroupedConvBwdWeight_Dl (presumably in the device-op
// header, which is not visible from here) -- confirm whenever either side changes.
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename AccDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t K0PerBlock,
ck::index_t K1,
ck::index_t M1PerThread,
ck::index_t N1PerThread,
ck::index_t KPerThread,
typename M1N1ThreadClusterM1Xs,
typename M1N1ThreadClusterN1Xs,
typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
typename ABlockTransferSrcVectorTensorContiguousDimOrder,
typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
typename BBlockTransferSrcVectorTensorContiguousDimOrder,
typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
typename CThreadTransferSrcDstAccessOrder,
ck::index_t CThreadTransferSrcDstVectorDim,
ck::index_t CThreadTransferDstScalarPerVector>
struct DeviceGroupedConvBwdWeight_Dl;
} // namespace ck::tensor_operation::device
namespace ck_tile {
namespace reflect {
/// InstanceTraits specialization for
/// ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Dl.
///
/// Every template argument of the device operation is re-exported: type
/// arguments as member aliases carrying the parameter's name, value arguments
/// as `k`-prefixed static constexpr members. instance_string() renders all of
/// them, in template-parameter order, as one string.
template <ck::index_t NDimSpatial,
          typename InLayout_,
          typename WeiLayout_,
          typename OutLayout_,
          typename InDataType_,
          typename WeiDataType_,
          typename OutDataType_,
          typename AccDataType_,
          typename InElementwiseOperation_,
          typename WeiElementwiseOperation_,
          typename OutElementwiseOperation_,
          ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
              ConvBackwardWeightSpecialization,
          ck::index_t BlockSize,
          ck::index_t MPerBlock,
          ck::index_t NPerBlock,
          ck::index_t K0PerBlock,
          ck::index_t K1,
          ck::index_t M1PerThread,
          ck::index_t N1PerThread,
          ck::index_t KPerThread,
          typename M1N1ThreadClusterM1Xs_,
          typename M1N1ThreadClusterN1Xs_,
          typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1_,
          typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1_,
          typename ABlockTransferThreadClusterArrangeOrder_,
          typename ABlockTransferSrcAccessOrder_,
          typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1_,
          typename ABlockTransferSrcVectorTensorContiguousDimOrder_,
          typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1_,
          typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1_,
          typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1_,
          typename BBlockTransferThreadClusterArrangeOrder_,
          typename BBlockTransferSrcAccessOrder_,
          typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1_,
          typename BBlockTransferSrcVectorTensorContiguousDimOrder_,
          typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1_,
          typename CThreadTransferSrcDstAccessOrder_,
          ck::index_t CThreadTransferSrcDstVectorDim,
          ck::index_t CThreadTransferDstScalarPerVector>
struct InstanceTraits<ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Dl<
    NDimSpatial,
    InLayout_, WeiLayout_, OutLayout_,
    InDataType_, WeiDataType_, OutDataType_, AccDataType_,
    InElementwiseOperation_, WeiElementwiseOperation_, OutElementwiseOperation_,
    ConvBackwardWeightSpecialization,
    BlockSize, MPerBlock, NPerBlock, K0PerBlock, K1,
    M1PerThread, N1PerThread, KPerThread,
    M1N1ThreadClusterM1Xs_, M1N1ThreadClusterN1Xs_,
    ABlockTransferThreadSliceLengths_K0_M0_M1_K1_,
    ABlockTransferThreadClusterLengths_K0_M0_M1_K1_,
    ABlockTransferThreadClusterArrangeOrder_,
    ABlockTransferSrcAccessOrder_,
    ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1_,
    ABlockTransferSrcVectorTensorContiguousDimOrder_,
    ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1_,
    BBlockTransferThreadSliceLengths_K0_N0_N1_K1_,
    BBlockTransferThreadClusterLengths_K0_N0_N1_K1_,
    BBlockTransferThreadClusterArrangeOrder_,
    BBlockTransferSrcAccessOrder_,
    BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1_,
    BBlockTransferSrcVectorTensorContiguousDimOrder_,
    BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1_,
    CThreadTransferSrcDstAccessOrder_,
    CThreadTransferSrcDstVectorDim,
    CThreadTransferDstScalarPerVector>>
{
    /// Name of the device operation this specialization describes.
    static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeight_Dl";

    // ---- Type-valued template arguments ----------------------------------

    // Tensor layouts and data types.
    using InLayout    = InLayout_;
    using WeiLayout   = WeiLayout_;
    using OutLayout   = OutLayout_;
    using InDataType  = InDataType_;
    using WeiDataType = WeiDataType_;
    using OutDataType = OutDataType_;
    using AccDataType = AccDataType_;

    // Elementwise operations.
    using InElementwiseOperation  = InElementwiseOperation_;
    using WeiElementwiseOperation = WeiElementwiseOperation_;
    using OutElementwiseOperation = OutElementwiseOperation_;

    // M1/N1 thread-cluster descriptions.
    using M1N1ThreadClusterM1Xs = M1N1ThreadClusterM1Xs_;
    using M1N1ThreadClusterN1Xs = M1N1ThreadClusterN1Xs_;

    // A block-transfer descriptions.
    using ABlockTransferThreadSliceLengths_K0_M0_M1_K1 =
        ABlockTransferThreadSliceLengths_K0_M0_M1_K1_;
    using ABlockTransferThreadClusterLengths_K0_M0_M1_K1 =
        ABlockTransferThreadClusterLengths_K0_M0_M1_K1_;
    using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
    using ABlockTransferSrcAccessOrder            = ABlockTransferSrcAccessOrder_;
    using ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 =
        ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1_;
    using ABlockTransferSrcVectorTensorContiguousDimOrder =
        ABlockTransferSrcVectorTensorContiguousDimOrder_;
    using ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 =
        ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1_;

    // B block-transfer descriptions.
    using BBlockTransferThreadSliceLengths_K0_N0_N1_K1 =
        BBlockTransferThreadSliceLengths_K0_N0_N1_K1_;
    using BBlockTransferThreadClusterLengths_K0_N0_N1_K1 =
        BBlockTransferThreadClusterLengths_K0_N0_N1_K1_;
    using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
    using BBlockTransferSrcAccessOrder            = BBlockTransferSrcAccessOrder_;
    using BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 =
        BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1_;
    using BBlockTransferSrcVectorTensorContiguousDimOrder =
        BBlockTransferSrcVectorTensorContiguousDimOrder_;
    using BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 =
        BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1_;

    // C thread-transfer access order.
    using CThreadTransferSrcDstAccessOrder = CThreadTransferSrcDstAccessOrder_;

    // ---- Value template arguments ----------------------------------------

    static constexpr ck::index_t kNDimSpatial = NDimSpatial;
    static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;

    static constexpr ck::index_t kBlockSize   = BlockSize;
    static constexpr ck::index_t kMPerBlock   = MPerBlock;
    static constexpr ck::index_t kNPerBlock   = NPerBlock;
    static constexpr ck::index_t kK0PerBlock  = K0PerBlock;
    static constexpr ck::index_t kK1          = K1;
    static constexpr ck::index_t kM1PerThread = M1PerThread;
    static constexpr ck::index_t kN1PerThread = N1PerThread;
    static constexpr ck::index_t kKPerThread  = KPerThread;

    static constexpr ck::index_t kCThreadTransferSrcDstVectorDim = CThreadTransferSrcDstVectorDim;
    static constexpr ck::index_t kCThreadTransferDstScalarPerVector =
        CThreadTransferDstScalarPerVector;

    /// Builds "<op name><arg1,arg2,...,arg39>": the operation name followed by
    /// every template argument, comma-separated, in declaration order.
    static std::string instance_string()
    {
        std::ostringstream text;

        // Operation name and argument 1 (NDimSpatial).
        text << kTensorOpName << '<' << kNDimSpatial;

        // 2-4: layouts.
        text << ',' << detail::layout_name<InLayout>()
             << ',' << detail::layout_name<WeiLayout>()
             << ',' << detail::layout_name<OutLayout>();

        // 5-8: data types.
        text << ',' << detail::type_name<InDataType>()
             << ',' << detail::type_name<WeiDataType>()
             << ',' << detail::type_name<OutDataType>()
             << ',' << detail::type_name<AccDataType>();

        // 9-11: elementwise operations.
        text << ',' << detail::elementwise_op_name<InElementwiseOperation>()
             << ',' << detail::elementwise_op_name<WeiElementwiseOperation>()
             << ',' << detail::elementwise_op_name<OutElementwiseOperation>();

        // 12: convolution backward-weight specialization.
        text << ',' << detail::conv_bwd_weight_spec_name(kConvBackwardWeightSpecialization);

        // 13-20: BlockSize, M/N/K0 per block, K1, M1/N1/K per thread.
        text << ',' << kBlockSize
             << ',' << kMPerBlock
             << ',' << kNPerBlock
             << ',' << kK0PerBlock
             << ',' << kK1
             << ',' << kM1PerThread
             << ',' << kN1PerThread
             << ',' << kKPerThread;

        // 21-22: M1/N1 thread clusters.
        text << ',' << detail::sequence_name<M1N1ThreadClusterM1Xs>()
             << ',' << detail::sequence_name<M1N1ThreadClusterN1Xs>();

        // 23-29: A block transfer.
        text << ',' << detail::sequence_name<ABlockTransferThreadSliceLengths_K0_M0_M1_K1>()
             << ',' << detail::sequence_name<ABlockTransferThreadClusterLengths_K0_M0_M1_K1>()
             << ',' << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>()
             << ',' << detail::sequence_name<ABlockTransferSrcAccessOrder>()
             << ',' << detail::sequence_name<ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1>()
             << ',' << detail::sequence_name<ABlockTransferSrcVectorTensorContiguousDimOrder>()
             << ',' << detail::sequence_name<ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1>();

        // 30-36: B block transfer.
        text << ',' << detail::sequence_name<BBlockTransferThreadSliceLengths_K0_N0_N1_K1>()
             << ',' << detail::sequence_name<BBlockTransferThreadClusterLengths_K0_N0_N1_K1>()
             << ',' << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>()
             << ',' << detail::sequence_name<BBlockTransferSrcAccessOrder>()
             << ',' << detail::sequence_name<BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1>()
             << ',' << detail::sequence_name<BBlockTransferSrcVectorTensorContiguousDimOrder>()
             << ',' << detail::sequence_name<BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1>();

        // 37-39: C thread transfer.
        text << ',' << detail::sequence_name<CThreadTransferSrcDstAccessOrder>()
             << ',' << kCThreadTransferSrcDstVectorDim
             << ',' << kCThreadTransferDstScalarPerVector;

        text << '>';
        return text.str();
    }
};
} // namespace reflect
} // namespace ck_tile

View File

@@ -0,0 +1,295 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "instance_traits.hpp"
#include "instance_traits_util.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
namespace ck::tensor_operation::device {
// Forward declaration of the device operation template. It is declared (not
// defined) here so that the InstanceTraits specialization later in this header
// can name the type.
// NOTE(review): the kinds and order of these template parameters must match the
// real definition of DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3
// (presumably in the device-op header, which is not visible from here) --
// confirm whenever either side changes.
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename DsLayout,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename AccDataType,
typename DsDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t KPerBlock,
ck::index_t ABK1,
ck::index_t MPerWmma,
ck::index_t NPerWmma,
ck::index_t MRepeat,
ck::index_t NRepeat,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMRepeatPerShuffle,
ck::index_t CShuffleNRepeatPerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
typename ComputeTypeA,
typename ComputeTypeB>
struct DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3;
} // namespace ck::tensor_operation::device
namespace ck_tile {
namespace reflect {
template <ck::index_t NDimSpatial,
typename InLayout_,
typename WeiLayout_,
typename OutLayout_,
typename DsLayout_,
typename InDataType_,
typename WeiDataType_,
typename OutDataType_,
typename AccDataType_,
typename DsDataType_,
typename InElementwiseOperation_,
typename WeiElementwiseOperation_,
typename OutElementwiseOperation_,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t KPerBlock,
ck::index_t ABK1,
ck::index_t MPerWmma,
ck::index_t NPerWmma,
ck::index_t MRepeat,
ck::index_t NRepeat,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1_,
typename ABlockTransferThreadClusterArrangeOrder_,
typename ABlockTransferSrcAccessOrder_,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1_,
typename BBlockTransferThreadClusterArrangeOrder_,
typename BBlockTransferSrcAccessOrder_,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMRepeatPerShuffle,
ck::index_t CShuffleNRepeatPerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
typename ComputeTypeA_,
typename ComputeTypeB_>
struct InstanceTraits<
ck::tensor_operation::device::DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3<
NDimSpatial,
InLayout_,
WeiLayout_,
OutLayout_,
DsLayout_,
InDataType_,
WeiDataType_,
OutDataType_,
AccDataType_,
DsDataType_,
InElementwiseOperation_,
WeiElementwiseOperation_,
OutElementwiseOperation_,
ConvBackwardWeightSpecialization,
BlockSize,
MPerBlock,
NPerBlock,
KPerBlock,
ABK1,
MPerWmma,
NPerWmma,
MRepeat,
NRepeat,
ABlockTransferThreadClusterLengths_AK0_M_AK1_,
ABlockTransferThreadClusterArrangeOrder_,
ABlockTransferSrcAccessOrder_,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_AK1,
ABlockLdsAddExtraM,
BBlockTransferThreadClusterLengths_BK0_N_BK1_,
BBlockTransferThreadClusterArrangeOrder_,
BBlockTransferSrcAccessOrder_,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_BK1,
BBlockLdsAddExtraN,
CShuffleMRepeatPerShuffle,
CShuffleNRepeatPerShuffle,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
CShuffleBlockTransferScalarPerVector_NPerBlock,
BlkGemmPipeSched,
BlkGemmPipelineVer,
ComputeTypeA_,
ComputeTypeB_>>
{
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3";
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
using InLayout = InLayout_;
using WeiLayout = WeiLayout_;
using OutLayout = OutLayout_;
using DsLayout = DsLayout_;
using InDataType = InDataType_;
using WeiDataType = WeiDataType_;
using OutDataType = OutDataType_;
using AccDataType = AccDataType_;
using DsDataType = DsDataType_;
using InElementwiseOperation = InElementwiseOperation_;
using WeiElementwiseOperation = WeiElementwiseOperation_;
using OutElementwiseOperation = OutElementwiseOperation_;
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
static constexpr ck::index_t kBlockSize = BlockSize;
static constexpr ck::index_t kMPerBlock = MPerBlock;
static constexpr ck::index_t kNPerBlock = NPerBlock;
static constexpr ck::index_t kKPerBlock = KPerBlock;
static constexpr ck::index_t kABK1 = ABK1;
static constexpr ck::index_t kMPerWmma = MPerWmma;
static constexpr ck::index_t kNPerWmma = NPerWmma;
static constexpr ck::index_t kMRepeat = MRepeat;
static constexpr ck::index_t kNRepeat = NRepeat;
static constexpr ck::index_t kCShuffleMRepeatPerShuffle = CShuffleMRepeatPerShuffle;
static constexpr ck::index_t kCShuffleNRepeatPerShuffle = CShuffleNRepeatPerShuffle;
static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_NPerBlock =
CShuffleBlockTransferScalarPerVector_NPerBlock;
using ABlockTransferThreadClusterLengths_AK0_M_AK1 =
ABlockTransferThreadClusterLengths_AK0_M_AK1_;
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
ABlockTransferSrcScalarPerVector;
static constexpr ck::index_t kABlockTransferDstScalarPerVector_AK1 =
ABlockTransferDstScalarPerVector_AK1;
static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
using BBlockTransferThreadClusterLengths_BK0_N_BK1 =
BBlockTransferThreadClusterLengths_BK0_N_BK1_;
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
BBlockTransferSrcScalarPerVector;
static constexpr ck::index_t kBBlockTransferDstScalarPerVector_BK1 =
BBlockTransferDstScalarPerVector_BK1;
static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
static constexpr ck::BlockGemmPipelineScheduler kBlkGemmPipeSched = BlkGemmPipeSched;
static constexpr ck::BlockGemmPipelineVersion kBlkGemmPipelineVer = BlkGemmPipelineVer;
using ComputeTypeA = ComputeTypeA_;
using ComputeTypeB = ComputeTypeB_;
// Static member function to generate instance string
static std::string instance_string()
{
std::ostringstream oss;
// Kernel type name
oss << "DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3";
// Template parameters in exact order
oss << "<" << kNDimSpatial; // 1. NDimSpatial
oss << "," << detail::layout_name<InLayout>(); // 2. InLayout
oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
oss << "," << detail::tuple_name<DsLayout>(); // 5. DsLayout
oss << "," << detail::type_name<InDataType>(); // 6. InDataType
oss << "," << detail::type_name<WeiDataType>(); // 7. WeiDataType
oss << "," << detail::type_name<OutDataType>(); // 8. OutDataType
oss << "," << detail::type_name<AccDataType>(); // 9. AccDataType
oss << "," << detail::tuple_name<DsDataType>(); // 10. DsDataType
oss << ","
<< detail::elementwise_op_name<InElementwiseOperation>(); // 11. InElementwiseOperation
oss << ","
<< detail::elementwise_op_name<WeiElementwiseOperation>(); // 12.
// WeiElementwiseOperation
oss << ","
<< detail::elementwise_op_name<OutElementwiseOperation>(); // 13.
// OutElementwiseOperation
oss << ","
<< detail::conv_bwd_weight_spec_name(
kConvBackwardWeightSpecialization); // 14. ConvBackwardWeightSpecialization
oss << "," << kBlockSize; // 15. BlockSize
oss << "," << kMPerBlock; // 16. MPerBlock
oss << "," << kNPerBlock; // 17. NPerBlock
oss << "," << kKPerBlock; // 18. KPerBlock
oss << "," << kABK1; // 19. ABK1
oss << "," << kMPerWmma; // 20. MPerWmma
oss << "," << kNPerWmma; // 21. NPerWmma
oss << "," << kMRepeat; // 22. MRepeat
oss << "," << kNRepeat; // 23. NRepeat
oss << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_AK0_M_AK1>(); // 24.
oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>(); // 25.
oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>(); // 26.
oss << "," << kABlockTransferSrcVectorDim; // 27.
oss << "," << kABlockTransferSrcScalarPerVector; // 28.
oss << "," << kABlockTransferDstScalarPerVector_AK1; // 29.
oss << "," << (kABlockLdsAddExtraM ? "true" : "false"); // 30.
oss << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_BK0_N_BK1>(); // 31.
oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>(); // 32.
oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>(); // 33.
oss << "," << kBBlockTransferSrcVectorDim; // 34.
oss << "," << kBBlockTransferSrcScalarPerVector; // 35.
oss << "," << kBBlockTransferDstScalarPerVector_BK1; // 36.
oss << "," << (kBBlockLdsAddExtraN ? "true" : "false"); // 37.
oss << "," << kCShuffleMRepeatPerShuffle; // 38.
oss << "," << kCShuffleNRepeatPerShuffle; // 39.
oss << ","
<< detail::sequence_name<
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 40.
oss << "," << kCShuffleBlockTransferScalarPerVector_NPerBlock; // 41.
oss << "," << detail::pipeline_scheduler_name(kBlkGemmPipeSched); // 42.
oss << "," << detail::pipeline_version_name(kBlkGemmPipelineVer); // 43.
oss << "," << detail::type_name<ComputeTypeA>(); // 44.
oss << "," << detail::type_name<ComputeTypeB>(); // 45.
oss << ">";
return oss.str();
}
};
} // namespace reflect
} // namespace ck_tile

View File

@@ -0,0 +1,282 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "instance_traits.hpp"
#include "instance_traits_util.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
namespace ck::tensor_operation::device {
// Forward declaration of the device operation template. It is declared (not
// defined) here so that the InstanceTraits specialization later in this header
// can name the type.
// NOTE(review): the kinds and order of these template parameters must match the
// real definition of DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
// (presumably in the device-op header, which is not visible from here) --
// confirm whenever either side changes.
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename DsLayout,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename AccDataType,
typename DsDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t K0PerBlock,
ck::index_t K1,
ck::index_t MPerXDL,
ck::index_t NPerXDL,
ck::index_t MXdlPerWave,
ck::index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_K0_M_K1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_K1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMXdlPerWavePerShuffle,
ck::index_t CShuffleNXdlPerWavePerShuffle,
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
typename ComputeTypeA,
typename ComputeTypeB>
struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle;
} // namespace ck::tensor_operation::device
namespace ck_tile {
namespace reflect {
template <ck::index_t NDimSpatial,
typename InLayout_,
typename WeiLayout_,
typename OutLayout_,
typename DsLayout_,
typename InDataType_,
typename WeiDataType_,
typename OutDataType_,
typename AccDataType_,
typename DsDataType_,
typename InElementwiseOperation_,
typename WeiElementwiseOperation_,
typename OutElementwiseOperation_,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t K0PerBlock,
ck::index_t K1,
ck::index_t MPerXDL,
ck::index_t NPerXDL,
ck::index_t MXdlPerWave,
ck::index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_K0_M_K1_,
typename ABlockTransferThreadClusterArrangeOrder_,
typename ABlockTransferSrcAccessOrder_,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_K0_N_K1_,
typename BBlockTransferThreadClusterArrangeOrder_,
typename BBlockTransferSrcAccessOrder_,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_K1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMXdlPerWavePerShuffle,
ck::index_t CShuffleNXdlPerWavePerShuffle,
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
typename ComputeTypeA_,
typename ComputeTypeB_>
struct InstanceTraits<
ck::tensor_operation::device::DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout_,
WeiLayout_,
OutLayout_,
DsLayout_,
InDataType_,
WeiDataType_,
OutDataType_,
AccDataType_,
DsDataType_,
InElementwiseOperation_,
WeiElementwiseOperation_,
OutElementwiseOperation_,
ConvBackwardWeightSpecialization,
BlockSize,
MPerBlock,
NPerBlock,
K0PerBlock,
K1,
MPerXDL,
NPerXDL,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_K0_M_K1_,
ABlockTransferThreadClusterArrangeOrder_,
ABlockTransferSrcAccessOrder_,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
ABlockLdsAddExtraM,
BBlockTransferThreadClusterLengths_K0_N_K1_,
BBlockTransferThreadClusterArrangeOrder_,
BBlockTransferSrcAccessOrder_,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BBlockLdsAddExtraN,
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
CBlockTransferScalarPerVector_NWaveNPerXdl,
ComputeTypeA_,
ComputeTypeB_>>
{
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle";
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
using InLayout = InLayout_;
using WeiLayout = WeiLayout_;
using OutLayout = OutLayout_;
using DsLayout = DsLayout_;
using InDataType = InDataType_;
using WeiDataType = WeiDataType_;
using OutDataType = OutDataType_;
using AccDataType = AccDataType_;
using DsDataType = DsDataType_;
using InElementwiseOperation = InElementwiseOperation_;
using WeiElementwiseOperation = WeiElementwiseOperation_;
using OutElementwiseOperation = OutElementwiseOperation_;
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
static constexpr ck::index_t kBlockSize = BlockSize;
static constexpr ck::index_t kMPerBlock = MPerBlock;
static constexpr ck::index_t kNPerBlock = NPerBlock;
static constexpr ck::index_t kK0PerBlock = K0PerBlock;
static constexpr ck::index_t kK1 = K1;
static constexpr ck::index_t kMPerXDL = MPerXDL;
static constexpr ck::index_t kNPerXDL = NPerXDL;
static constexpr ck::index_t kMXdlPerWave = MXdlPerWave;
static constexpr ck::index_t kNXdlPerWave = NXdlPerWave;
static constexpr ck::index_t kCShuffleMXdlPerWavePerShuffle = CShuffleMXdlPerWavePerShuffle;
static constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = CShuffleNXdlPerWavePerShuffle;
static constexpr ck::index_t kCBlockTransferScalarPerVector_NWaveNPerXdl =
CBlockTransferScalarPerVector_NWaveNPerXdl;
using ABlockTransferThreadClusterLengths_K0_M_K1 = ABlockTransferThreadClusterLengths_K0_M_K1_;
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
ABlockTransferSrcScalarPerVector;
static constexpr ck::index_t kABlockTransferDstScalarPerVector_K1 =
ABlockTransferDstScalarPerVector_K1;
static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
using BBlockTransferThreadClusterLengths_K0_N_K1 = BBlockTransferThreadClusterLengths_K0_N_K1_;
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
BBlockTransferSrcScalarPerVector;
static constexpr ck::index_t kBBlockTransferDstScalarPerVector_K1 =
BBlockTransferDstScalarPerVector_K1;
static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
using CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
using ComputeTypeA = ComputeTypeA_;
using ComputeTypeB = ComputeTypeB_;
// Renders this kernel instance as text: the kernel name followed by its
// configuration values as a comma-separated list in angle brackets. The
// numbered comments give each value's position in that list, which is meant to
// follow the kernel's template parameter order.
// NOTE(review): the parameter list is declared earlier in the file - confirm
// the two stay aligned when parameters are added or removed.
static std::string instance_string()
{
    // Booleans are written as the words "true" / "false".
    const auto flag_text = [](bool flag) { return flag ? "true" : "false"; };

    std::ostringstream text;

    // Kernel name, then the opening bracket of the value list.
    text << "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle" << "<";

    // 1-10: spatial rank, layouts and data types.
    text << kNDimSpatial                                // 1. NDimSpatial
         << "," << detail::layout_name<InLayout>()      // 2. InLayout
         << "," << detail::layout_name<WeiLayout>()     // 3. WeiLayout
         << "," << detail::layout_name<OutLayout>()     // 4. OutLayout
         << "," << detail::tuple_name<DsLayout>()       // 5. DsLayout
         << "," << detail::type_name<InDataType>()      // 6. InDataType
         << "," << detail::type_name<WeiDataType>()     // 7. WeiDataType
         << "," << detail::type_name<OutDataType>()     // 8. OutDataType
         << "," << detail::type_name<AccDataType>()     // 9. AccDataType
         << "," << detail::tuple_name<DsDataType>();    // 10. DsDataType

    // 11-14: element-wise operations and the convolution specialization.
    text << "," << detail::elementwise_op_name<InElementwiseOperation>()  // 11.
         << "," << detail::elementwise_op_name<WeiElementwiseOperation>() // 12.
         << "," << detail::elementwise_op_name<OutElementwiseOperation>() // 13.
         << ","
         << detail::conv_bwd_weight_spec_name(kConvBackwardWeightSpecialization); // 14.

    // 15-23: integer block / XDL parameters.
    text << "," << kBlockSize    // 15. BlockSize
         << "," << kMPerBlock    // 16. MPerBlock
         << "," << kNPerBlock    // 17. NPerBlock
         << "," << kK0PerBlock   // 18. K0PerBlock
         << "," << kK1           // 19. K1
         << "," << kMPerXDL      // 20. MPerXDL
         << "," << kNPerXDL      // 21. NPerXDL
         << "," << kMXdlPerWave  // 22. MXdlPerWave
         << "," << kNXdlPerWave; // 23. NXdlPerWave

    // 24-30: A-block transfer parameters.
    text << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_K0_M_K1>() // 24.
         << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>()    // 25.
         << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>()               // 26.
         << "," << kABlockTransferSrcVectorDim                                         // 27.
         << "," << kABlockTransferSrcScalarPerVector                                   // 28.
         << "," << kABlockTransferDstScalarPerVector_K1                                // 29.
         << "," << flag_text(kABlockLdsAddExtraM);                                     // 30.

    // 31-37: B-block transfer parameters.
    text << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_K0_N_K1>() // 31.
         << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>()    // 32.
         << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>()               // 33.
         << "," << kBBlockTransferSrcVectorDim                                         // 34.
         << "," << kBBlockTransferSrcScalarPerVector                                   // 35.
         << "," << kBBlockTransferDstScalarPerVector_K1                                // 36.
         << "," << flag_text(kBBlockLdsAddExtraN);                                     // 37.

    // 38-41: C-shuffle / C-block transfer parameters.
    text << "," << kCShuffleMXdlPerWavePerShuffle // 38.
         << "," << kCShuffleNXdlPerWavePerShuffle // 39.
         << ","
         << detail::sequence_name<
                CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>() // 40.
         << "," << kCBlockTransferScalarPerVector_NWaveNPerXdl;                   // 41.

    // 42-43: compute types, then the closing bracket.
    text << "," << detail::type_name<ComputeTypeA>() // 42.
         << "," << detail::type_name<ComputeTypeB>() // 43.
         << ">";

    return text.str();
}
};
} // namespace reflect
} // namespace ck_tile

View File

@@ -0,0 +1,302 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "instance_traits.hpp"
#include "instance_traits_util.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
namespace ck::tensor_operation::device {
// Declaration only (no body) of the DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3
// device operation template, so that the InstanceTraits specialization in this file can
// name the type.
// NOTE(review): the defining declaration is not in this file; this parameter list has to
// match it in number, kind and order - confirm whenever the kernel signature changes.
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename AccDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t KPerBlock,
ck::index_t ABK1,
ck::index_t MPerWmma,
ck::index_t NPerWmma,
ck::index_t MRepeat,
ck::index_t NRepeat,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMRepeatPerShuffle,
ck::index_t CShuffleNRepeatPerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
ck::index_t NumGroupsToMerge,
typename ComputeTypeA,
typename ComputeTypeB,
ck::index_t TransposeTransferSrcScalarPerVector,
ck::index_t TransposeTransferDstScalarPerVector>
struct DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3;
} // namespace ck::tensor_operation::device
namespace ck_tile {
namespace reflect {
template <ck::index_t NDimSpatial,
typename InLayout_,
typename WeiLayout_,
typename OutLayout_,
typename InDataType_,
typename WeiDataType_,
typename OutDataType_,
typename AccDataType_,
typename InElementwiseOperation_,
typename WeiElementwiseOperation_,
typename OutElementwiseOperation_,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t KPerBlock,
ck::index_t ABK1,
ck::index_t MPerWmma,
ck::index_t NPerWmma,
ck::index_t MRepeat,
ck::index_t NRepeat,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1_,
typename ABlockTransferThreadClusterArrangeOrder_,
typename ABlockTransferSrcAccessOrder_,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1_,
typename BBlockTransferThreadClusterArrangeOrder_,
typename BBlockTransferSrcAccessOrder_,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMRepeatPerShuffle,
ck::index_t CShuffleNRepeatPerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
ck::index_t NumGroupsToMerge,
typename ComputeTypeA_,
typename ComputeTypeB_,
ck::index_t TransposeTransferSrcScalarPerVector,
ck::index_t TransposeTransferDstScalarPerVector>
struct InstanceTraits<
ck::tensor_operation::device::DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3<
NDimSpatial,
InLayout_,
WeiLayout_,
OutLayout_,
InDataType_,
WeiDataType_,
OutDataType_,
AccDataType_,
InElementwiseOperation_,
WeiElementwiseOperation_,
OutElementwiseOperation_,
ConvBackwardWeightSpecialization,
BlockSize,
MPerBlock,
NPerBlock,
KPerBlock,
ABK1,
MPerWmma,
NPerWmma,
MRepeat,
NRepeat,
ABlockTransferThreadClusterLengths_AK0_M_AK1_,
ABlockTransferThreadClusterArrangeOrder_,
ABlockTransferSrcAccessOrder_,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_AK1,
ABlockLdsAddExtraM,
BBlockTransferThreadClusterLengths_BK0_N_BK1_,
BBlockTransferThreadClusterArrangeOrder_,
BBlockTransferSrcAccessOrder_,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_BK1,
BBlockLdsAddExtraN,
CShuffleMRepeatPerShuffle,
CShuffleNRepeatPerShuffle,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
CShuffleBlockTransferScalarPerVector_NPerBlock,
BlkGemmPipeSched,
BlkGemmPipelineVer,
NumGroupsToMerge,
ComputeTypeA_,
ComputeTypeB_,
TransposeTransferSrcScalarPerVector,
TransposeTransferDstScalarPerVector>>
{
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3";
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
using InLayout = InLayout_;
using WeiLayout = WeiLayout_;
using OutLayout = OutLayout_;
using InDataType = InDataType_;
using WeiDataType = WeiDataType_;
using OutDataType = OutDataType_;
using AccDataType = AccDataType_;
using InElementwiseOperation = InElementwiseOperation_;
using WeiElementwiseOperation = WeiElementwiseOperation_;
using OutElementwiseOperation = OutElementwiseOperation_;
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
static constexpr ck::index_t kBlockSize = BlockSize;
static constexpr ck::index_t kMPerBlock = MPerBlock;
static constexpr ck::index_t kNPerBlock = NPerBlock;
static constexpr ck::index_t kKPerBlock = KPerBlock;
static constexpr ck::index_t kABK1 = ABK1;
static constexpr ck::index_t kMPerWmma = MPerWmma;
static constexpr ck::index_t kNPerWmma = NPerWmma;
static constexpr ck::index_t kMRepeat = MRepeat;
static constexpr ck::index_t kNRepeat = NRepeat;
static constexpr ck::index_t kCShuffleMRepeatPerShuffle = CShuffleMRepeatPerShuffle;
static constexpr ck::index_t kCShuffleNRepeatPerShuffle = CShuffleNRepeatPerShuffle;
static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_NPerBlock =
CShuffleBlockTransferScalarPerVector_NPerBlock;
static constexpr ck::index_t kNumGroupsToMerge = NumGroupsToMerge;
static constexpr ck::index_t kTransposeTransferSrcScalarPerVector =
TransposeTransferSrcScalarPerVector;
static constexpr ck::index_t kTransposeTransferDstScalarPerVector =
TransposeTransferDstScalarPerVector;
using ABlockTransferThreadClusterLengths_AK0_M_AK1 =
ABlockTransferThreadClusterLengths_AK0_M_AK1_;
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
ABlockTransferSrcScalarPerVector;
static constexpr ck::index_t kABlockTransferDstScalarPerVector_AK1 =
ABlockTransferDstScalarPerVector_AK1;
static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
using BBlockTransferThreadClusterLengths_BK0_N_BK1 =
BBlockTransferThreadClusterLengths_BK0_N_BK1_;
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
BBlockTransferSrcScalarPerVector;
static constexpr ck::index_t kBBlockTransferDstScalarPerVector_BK1 =
BBlockTransferDstScalarPerVector_BK1;
static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
static constexpr ck::BlockGemmPipelineScheduler kBlkGemmPipeSched = BlkGemmPipeSched;
static constexpr ck::BlockGemmPipelineVersion kBlkGemmPipelineVer = BlkGemmPipelineVer;
using ComputeTypeA = ComputeTypeA_;
using ComputeTypeB = ComputeTypeB_;
// Static member function to generate instance string
static std::string instance_string()
{
std::ostringstream oss;
// Kernel type name
oss << "DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3";
// Template parameters in exact order
oss << "<" << kNDimSpatial; // 1. NDimSpatial
oss << "," << detail::layout_name<InLayout>(); // 2. InLayout
oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
oss << "," << detail::type_name<InDataType>(); // 5. InDataType
oss << "," << detail::type_name<WeiDataType>(); // 6. WeiDataType
oss << "," << detail::type_name<OutDataType>(); // 7. OutDataType
oss << "," << detail::type_name<AccDataType>(); // 8. AccDataType
oss << ","
<< detail::elementwise_op_name<InElementwiseOperation>(); // 9. InElementwiseOperation
oss << ","
<< detail::elementwise_op_name<WeiElementwiseOperation>(); // 10.
// WeiElementwiseOperation
oss << ","
<< detail::elementwise_op_name<OutElementwiseOperation>(); // 11.
// OutElementwiseOperation
oss << ","
<< detail::conv_bwd_weight_spec_name(
kConvBackwardWeightSpecialization); // 12. ConvBackwardWeightSpecialization
oss << "," << kBlockSize; // 13. BlockSize
oss << "," << kMPerBlock; // 14. MPerBlock
oss << "," << kNPerBlock; // 15. NPerBlock
oss << "," << kKPerBlock; // 16. KPerBlock
oss << "," << kABK1; // 17. ABK1
oss << "," << kMPerWmma; // 18. MPerWmma
oss << "," << kNPerWmma; // 19. NPerWmma
oss << "," << kMRepeat; // 20. MRepeat
oss << "," << kNRepeat; // 21. NRepeat
oss << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_AK0_M_AK1>(); // 22.
oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>(); // 23.
oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>(); // 24.
oss << "," << kABlockTransferSrcVectorDim; // 25.
oss << "," << kABlockTransferSrcScalarPerVector; // 26.
oss << "," << kABlockTransferDstScalarPerVector_AK1; // 27.
oss << "," << (kABlockLdsAddExtraM ? "true" : "false"); // 28.
oss << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_BK0_N_BK1>(); // 29.
oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>(); // 30.
oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>(); // 31.
oss << "," << kBBlockTransferSrcVectorDim; // 32.
oss << "," << kBBlockTransferSrcScalarPerVector; // 33.
oss << "," << kBBlockTransferDstScalarPerVector_BK1; // 34.
oss << "," << (kBBlockLdsAddExtraN ? "true" : "false"); // 35.
oss << "," << kCShuffleMRepeatPerShuffle; // 36.
oss << "," << kCShuffleNRepeatPerShuffle; // 37.
oss << ","
<< detail::sequence_name<
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 38.
oss << "," << kCShuffleBlockTransferScalarPerVector_NPerBlock; // 39.
oss << "," << detail::pipeline_scheduler_name(kBlkGemmPipeSched); // 40.
oss << "," << detail::pipeline_version_name(kBlkGemmPipelineVer); // 41.
oss << "," << kNumGroupsToMerge; // 42.
oss << "," << detail::type_name<ComputeTypeA>(); // 43.
oss << "," << detail::type_name<ComputeTypeB>(); // 44.
oss << "," << kTransposeTransferSrcScalarPerVector; // 45.
oss << "," << kTransposeTransferDstScalarPerVector; // 46.
oss << ">";
return oss.str();
}
};
} // namespace reflect
} // namespace ck_tile

View File

@@ -0,0 +1,299 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "instance_traits.hpp"
#include "instance_traits_util.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
namespace ck::tensor_operation::device {
// Declaration only (no body) of the DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
// device operation template, so that the InstanceTraits specialization in this file can
// name the type.
// NOTE(review): the defining declaration is not in this file; this parameter list has to
// match it in number, kind and order - confirm whenever the kernel signature changes.
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename AccDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t KPerBlock,
ck::index_t K1,
ck::index_t MPerXDL,
ck::index_t NPerXDL,
ck::index_t MXdlPerWave,
ck::index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_K0_M_K1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_K1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMXdlPerWavePerShuffle,
ck::index_t CShuffleNXdlPerWavePerShuffle,
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
ck::index_t NumGroupsToMerge,
typename ComputeTypeA,
typename ComputeTypeB,
ck::index_t TransposeTransferSrcScalarPerVector,
ck::index_t TransposeTransferDstScalarPerVector>
struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle;
} // namespace ck::tensor_operation::device
namespace ck_tile {
namespace reflect {
template <ck::index_t NDimSpatial,
typename InLayout_,
typename WeiLayout_,
typename OutLayout_,
typename InDataType_,
typename WeiDataType_,
typename OutDataType_,
typename AccDataType_,
typename InElementwiseOperation_,
typename WeiElementwiseOperation_,
typename OutElementwiseOperation_,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t KPerBlock,
ck::index_t K1,
ck::index_t MPerXDL,
ck::index_t NPerXDL,
ck::index_t MXdlPerWave,
ck::index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_K0_M_K1_,
typename ABlockTransferThreadClusterArrangeOrder_,
typename ABlockTransferSrcAccessOrder_,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_K0_N_K1_,
typename BBlockTransferThreadClusterArrangeOrder_,
typename BBlockTransferSrcAccessOrder_,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_K1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMXdlPerWavePerShuffle,
ck::index_t CShuffleNXdlPerWavePerShuffle,
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
ck::index_t NumGroupsToMerge,
typename ComputeTypeA_,
typename ComputeTypeB_,
ck::index_t TransposeTransferSrcScalarPerVector,
ck::index_t TransposeTransferDstScalarPerVector>
struct InstanceTraits<ck::tensor_operation::device::DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<
NDimSpatial,
InLayout_,
WeiLayout_,
OutLayout_,
InDataType_,
WeiDataType_,
OutDataType_,
AccDataType_,
InElementwiseOperation_,
WeiElementwiseOperation_,
OutElementwiseOperation_,
ConvBackwardWeightSpecialization,
BlockSize,
MPerBlock,
NPerBlock,
KPerBlock,
K1,
MPerXDL,
NPerXDL,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_K0_M_K1_,
ABlockTransferThreadClusterArrangeOrder_,
ABlockTransferSrcAccessOrder_,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
ABlockLdsAddExtraM,
BBlockTransferThreadClusterLengths_K0_N_K1_,
BBlockTransferThreadClusterArrangeOrder_,
BBlockTransferSrcAccessOrder_,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BBlockLdsAddExtraN,
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
CBlockTransferScalarPerVector_NWaveNPerXdl,
BlkGemmPipeSched,
BlkGemmPipelineVer,
NumGroupsToMerge,
ComputeTypeA_,
ComputeTypeB_,
TransposeTransferSrcScalarPerVector,
TransposeTransferDstScalarPerVector>>
{
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle";
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
using InLayout = InLayout_;
using WeiLayout = WeiLayout_;
using OutLayout = OutLayout_;
using InDataType = InDataType_;
using WeiDataType = WeiDataType_;
using OutDataType = OutDataType_;
using AccDataType = AccDataType_;
using InElementwiseOperation = InElementwiseOperation_;
using WeiElementwiseOperation = WeiElementwiseOperation_;
using OutElementwiseOperation = OutElementwiseOperation_;
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
static constexpr ck::index_t kBlockSize = BlockSize;
static constexpr ck::index_t kMPerBlock = MPerBlock;
static constexpr ck::index_t kNPerBlock = NPerBlock;
static constexpr ck::index_t kKPerBlock = KPerBlock;
static constexpr ck::index_t kK1 = K1;
static constexpr ck::index_t kMPerXDL = MPerXDL;
static constexpr ck::index_t kNPerXDL = NPerXDL;
static constexpr ck::index_t kMXdlPerWave = MXdlPerWave;
static constexpr ck::index_t kNXdlPerWave = NXdlPerWave;
static constexpr ck::index_t kCShuffleMXdlPerWavePerShuffle = CShuffleMXdlPerWavePerShuffle;
static constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = CShuffleNXdlPerWavePerShuffle;
static constexpr ck::index_t kCBlockTransferScalarPerVector_NWaveNPerXdl =
CBlockTransferScalarPerVector_NWaveNPerXdl;
static constexpr ck::index_t kNumGroupsToMerge = NumGroupsToMerge;
static constexpr ck::index_t kTransposeTransferSrcScalarPerVector =
TransposeTransferSrcScalarPerVector;
static constexpr ck::index_t kTransposeTransferDstScalarPerVector =
TransposeTransferDstScalarPerVector;
using ABlockTransferThreadClusterLengths_K0_M_K1 = ABlockTransferThreadClusterLengths_K0_M_K1_;
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
ABlockTransferSrcScalarPerVector;
static constexpr ck::index_t kABlockTransferDstScalarPerVector_K1 =
ABlockTransferDstScalarPerVector_K1;
static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
using BBlockTransferThreadClusterLengths_K0_N_K1 = BBlockTransferThreadClusterLengths_K0_N_K1_;
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
BBlockTransferSrcScalarPerVector;
static constexpr ck::index_t kBBlockTransferDstScalarPerVector_K1 =
BBlockTransferDstScalarPerVector_K1;
static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
using CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
static constexpr ck::BlockGemmPipelineScheduler kBlkGemmPipeSched = BlkGemmPipeSched;
static constexpr ck::BlockGemmPipelineVersion kBlkGemmPipelineVer = BlkGemmPipelineVer;
using ComputeTypeA = ComputeTypeA_;
using ComputeTypeB = ComputeTypeB_;
// Static member function to generate instance string
static std::string instance_string()
{
std::ostringstream oss;
// Kernel type name
oss << "DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle";
// Template parameters in exact order
oss << "<" << kNDimSpatial; // 1. NDimSpatial
oss << "," << detail::layout_name<InLayout>(); // 2. InLayout
oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
oss << "," << detail::type_name<InDataType>(); // 5. InDataType
oss << "," << detail::type_name<WeiDataType>(); // 6. WeiDataType
oss << "," << detail::type_name<OutDataType>(); // 7. OutDataType
oss << "," << detail::type_name<AccDataType>(); // 8. AccDataType
oss << ","
<< detail::elementwise_op_name<InElementwiseOperation>(); // 9. InElementwiseOperation
oss << ","
<< detail::elementwise_op_name<WeiElementwiseOperation>(); // 10.
// WeiElementwiseOperation
oss << ","
<< detail::elementwise_op_name<OutElementwiseOperation>(); // 11.
// OutElementwiseOperation
oss << ","
<< detail::conv_bwd_weight_spec_name(
kConvBackwardWeightSpecialization); // 12. ConvBackwardWeightSpecialization
oss << "," << kBlockSize; // 13. BlockSize
oss << "," << kMPerBlock; // 14. MPerBlock
oss << "," << kNPerBlock; // 15. NPerBlock
oss << "," << kKPerBlock; // 16. KPerBlock
oss << "," << kK1; // 17. K1
oss << "," << kMPerXDL; // 18. MPerXDL
oss << "," << kNPerXDL; // 19. NPerXDL
oss << "," << kMXdlPerWave; // 20. MXdlPerWave
oss << "," << kNXdlPerWave; // 21. NXdlPerWave
oss << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_K0_M_K1>(); // 22.
oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>(); // 23.
oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>(); // 24.
oss << "," << kABlockTransferSrcVectorDim; // 25.
oss << "," << kABlockTransferSrcScalarPerVector; // 26.
oss << "," << kABlockTransferDstScalarPerVector_K1; // 27.
oss << "," << (kABlockLdsAddExtraM ? "true" : "false"); // 28.
oss << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_K0_N_K1>(); // 29.
oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>(); // 30.
oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>(); // 31.
oss << "," << kBBlockTransferSrcVectorDim; // 32.
oss << "," << kBBlockTransferSrcScalarPerVector; // 33.
oss << "," << kBBlockTransferDstScalarPerVector_K1; // 34.
oss << "," << (kBBlockLdsAddExtraN ? "true" : "false"); // 35.
oss << "," << kCShuffleMXdlPerWavePerShuffle; // 36.
oss << "," << kCShuffleNXdlPerWavePerShuffle; // 37.
oss << ","
<< detail::sequence_name<
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 38.
oss << "," << kCBlockTransferScalarPerVector_NWaveNPerXdl; // 39.
oss << "," << detail::pipeline_scheduler_name(kBlkGemmPipeSched); // 40.
oss << "," << detail::pipeline_version_name(kBlkGemmPipelineVer); // 41.
oss << "," << kNumGroupsToMerge; // 42.
oss << "," << detail::type_name<ComputeTypeA>(); // 43.
oss << "," << detail::type_name<ComputeTypeB>(); // 44.
oss << "," << kTransposeTransferSrcScalarPerVector; // 45.
oss << "," << kTransposeTransferDstScalarPerVector; // 46.
oss << ">";
return oss.str();
}
};
} // namespace reflect
} // namespace ck_tile

View File

@@ -0,0 +1,277 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "instance_traits.hpp"
#include "instance_traits_util.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
namespace ck::tensor_operation::device {
// Declaration only (no body) of the DeviceGroupedConvBwdWeight_Wmma_CShuffle device
// operation template, so that the InstanceTraits specialization in this file can name
// the type.
// NOTE(review): the defining declaration is not in this file; this parameter list has to
// match it in number, kind and order - confirm whenever the kernel signature changes.
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename AccDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation,
ConvolutionBackwardWeightSpecialization ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t K0PerBlock,
ck::index_t K1,
ck::index_t MPerWMMA,
ck::index_t NPerWMMA,
ck::index_t MRepeat,
ck::index_t NRepeat,
typename ABlockTransferThreadClusterLengths_K0_M_K1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_K1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMRepeatPerShuffle,
ck::index_t CShuffleNRepeatPerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
ck::index_t NumGemmKPrefetchStage,
ck::LoopScheduler LoopSched,
ck::PipelineVersion PipelineVer,
// Unnamed trailing parameter whose type is computed from NDimSpatial through
// ck::enable_if; the InstanceTraits specialization in this file passes `false` for it.
// NOTE(review): presumably this restricts the kernel to NDimSpatial == 3 - confirm
// against the kernel definition.
typename ck::enable_if<NDimSpatial == 3, bool>::type>
struct DeviceGroupedConvBwdWeight_Wmma_CShuffle;
} // namespace ck::tensor_operation::device
namespace ck_tile {
namespace reflect {
template <ck::index_t NDimSpatial,
typename InLayout_,
typename WeiLayout_,
typename OutLayout_,
typename InDataType_,
typename WeiDataType_,
typename OutDataType_,
typename AccDataType_,
typename InElementwiseOperation_,
typename WeiElementwiseOperation_,
typename OutElementwiseOperation_,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t K0PerBlock,
ck::index_t K1,
ck::index_t MPerWMMA,
ck::index_t NPerWMMA,
ck::index_t MRepeat,
ck::index_t NRepeat,
typename ABlockTransferThreadClusterLengths_K0_M_K1_,
typename ABlockTransferThreadClusterArrangeOrder_,
typename ABlockTransferSrcAccessOrder_,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_K0_N_K1_,
typename BBlockTransferThreadClusterArrangeOrder_,
typename BBlockTransferSrcAccessOrder_,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_K1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMRepeatPerShuffle,
ck::index_t CShuffleNRepeatPerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
ck::index_t NumGemmKPrefetchStage,
ck::LoopScheduler LoopSched,
ck::PipelineVersion PipelineVer>
struct InstanceTraits<ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Wmma_CShuffle<
NDimSpatial,
InLayout_,
WeiLayout_,
OutLayout_,
InDataType_,
WeiDataType_,
OutDataType_,
AccDataType_,
InElementwiseOperation_,
WeiElementwiseOperation_,
OutElementwiseOperation_,
ConvBackwardWeightSpecialization,
BlockSize,
MPerBlock,
NPerBlock,
K0PerBlock,
K1,
MPerWMMA,
NPerWMMA,
MRepeat,
NRepeat,
ABlockTransferThreadClusterLengths_K0_M_K1_,
ABlockTransferThreadClusterArrangeOrder_,
ABlockTransferSrcAccessOrder_,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
ABlockLdsAddExtraM,
BBlockTransferThreadClusterLengths_K0_N_K1_,
BBlockTransferThreadClusterArrangeOrder_,
BBlockTransferSrcAccessOrder_,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BBlockLdsAddExtraN,
CShuffleMRepeatPerShuffle,
CShuffleNRepeatPerShuffle,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
CShuffleBlockTransferScalarPerVector_NPerBlock,
NumGemmKPrefetchStage,
LoopSched,
PipelineVer,
false>> // Use false to match with the default value
{
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeight_Wmma_CShuffle";
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
using InLayout = InLayout_;
using WeiLayout = WeiLayout_;
using OutLayout = OutLayout_;
using InDataType = InDataType_;
using WeiDataType = WeiDataType_;
using OutDataType = OutDataType_;
using AccDataType = AccDataType_;
using InElementwiseOperation = InElementwiseOperation_;
using WeiElementwiseOperation = WeiElementwiseOperation_;
using OutElementwiseOperation = OutElementwiseOperation_;
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
static constexpr ck::index_t kBlockSize = BlockSize;
static constexpr ck::index_t kMPerBlock = MPerBlock;
static constexpr ck::index_t kNPerBlock = NPerBlock;
static constexpr ck::index_t kK0PerBlock = K0PerBlock;
static constexpr ck::index_t kK1 = K1;
static constexpr ck::index_t kMPerWMMA = MPerWMMA;
static constexpr ck::index_t kNPerWMMA = NPerWMMA;
static constexpr ck::index_t kMRepeat = MRepeat;
static constexpr ck::index_t kNRepeat = NRepeat;
static constexpr ck::index_t kCShuffleMRepeatPerShuffle = CShuffleMRepeatPerShuffle;
static constexpr ck::index_t kCShuffleNRepeatPerShuffle = CShuffleNRepeatPerShuffle;
static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_NPerBlock =
CShuffleBlockTransferScalarPerVector_NPerBlock;
static constexpr ck::index_t kNumGemmKPrefetchStage = NumGemmKPrefetchStage;
using ABlockTransferThreadClusterLengths_K0_M_K1 = ABlockTransferThreadClusterLengths_K0_M_K1_;
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
ABlockTransferSrcScalarPerVector;
static constexpr ck::index_t kABlockTransferDstScalarPerVector_K1 =
ABlockTransferDstScalarPerVector_K1;
static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
using BBlockTransferThreadClusterLengths_K0_N_K1 = BBlockTransferThreadClusterLengths_K0_N_K1_;
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
BBlockTransferSrcScalarPerVector;
static constexpr ck::index_t kBBlockTransferDstScalarPerVector_K1 =
BBlockTransferDstScalarPerVector_K1;
static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
static constexpr ck::LoopScheduler kLoopSched = LoopSched;
static constexpr ck::PipelineVersion kPipelineVer = PipelineVer;
// Static member function to generate instance string
static std::string instance_string()
{
std::ostringstream oss;
// Kernel type name
oss << "DeviceGroupedConvBwdWeight_Wmma_CShuffle";
// Template parameters in exact order
oss << "<" << kNDimSpatial; // 1. NDimSpatial
oss << "," << detail::layout_name<InLayout>(); // 2. InLayout
oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
oss << "," << detail::type_name<InDataType>(); // 5. InDataType
oss << "," << detail::type_name<WeiDataType>(); // 6. WeiDataType
oss << "," << detail::type_name<OutDataType>(); // 7. OutDataType
oss << "," << detail::type_name<AccDataType>(); // 8. AccDataType
oss << ","
<< detail::elementwise_op_name<InElementwiseOperation>(); // 9. InElementwiseOperation
oss << ","
<< detail::elementwise_op_name<WeiElementwiseOperation>(); // 10.
// WeiElementwiseOperation
oss << ","
<< detail::elementwise_op_name<OutElementwiseOperation>(); // 11.
// OutElementwiseOperation
oss << ","
<< detail::conv_bwd_weight_spec_name(
kConvBackwardWeightSpecialization); // 12. ConvBackwardWeightSpecialization
oss << "," << kBlockSize; // 13. BlockSize
oss << "," << kMPerBlock; // 14. MPerBlock
oss << "," << kNPerBlock; // 15. NPerBlock
oss << "," << kK0PerBlock; // 16. K0PerBlock
oss << "," << kK1; // 17. K1
oss << "," << kMPerWMMA; // 18. MPerWMMA
oss << "," << kNPerWMMA; // 19. NPerWMMA
oss << "," << kMRepeat; // 20. MRepeat
oss << "," << kNRepeat; // 21. NRepeat
oss << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_K0_M_K1>(); // 22.
oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>(); // 23.
oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>(); // 24.
oss << "," << kABlockTransferSrcVectorDim; // 25.
oss << "," << kABlockTransferSrcScalarPerVector; // 26.
oss << "," << kABlockTransferDstScalarPerVector_K1; // 27.
oss << "," << (kABlockLdsAddExtraM ? "true" : "false"); // 28.
oss << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_K0_N_K1>(); // 29.
oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>(); // 30.
oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>(); // 31.
oss << "," << kBBlockTransferSrcVectorDim; // 32.
oss << "," << kBBlockTransferSrcScalarPerVector; // 33.
oss << "," << kBBlockTransferDstScalarPerVector_K1; // 34.
oss << "," << (kBBlockLdsAddExtraN ? "true" : "false"); // 35.
oss << "," << kCShuffleMRepeatPerShuffle; // 36.
oss << "," << kCShuffleNRepeatPerShuffle; // 37.
oss << ","
<< detail::sequence_name<
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 38.
oss << "," << kCShuffleBlockTransferScalarPerVector_NPerBlock; // 39.
oss << "," << kNumGemmKPrefetchStage; // 40.
oss << "," << detail::loop_scheduler_name(kLoopSched); // 41.
oss << "," << detail::pipeline_version_name(kPipelineVer); // 42.
oss << ">";
return oss.str();
}
};
} // namespace reflect
} // namespace ck_tile

View File

@@ -0,0 +1,296 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "instance_traits.hpp"
#include "instance_traits_util.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
// Forward declaration of the device operation template. Only the declaration
// is provided here (presumably to avoid a circular include with the header
// that defines the kernel -- TODO confirm); it is what allows InstanceTraits to
// be partially specialized on this template further down in this file.
// The kind and order of the template parameters below must stay in sync with
// the InstanceTraits specialization that names this template.
namespace ck::tensor_operation::device {
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename AccDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t KPerBlock,
ck::index_t ABK1,
ck::index_t MPerWmma,
ck::index_t NPerWmma,
ck::index_t MRepeat,
ck::index_t NRepeat,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMRepeatPerShuffle,
ck::index_t CShuffleNRepeatPerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
typename ComputeTypeA,
typename ComputeTypeB,
ck::index_t MaxTransposeTransferSrcScalarPerVector,
ck::index_t MaxTransposeTransferDstScalarPerVector>
struct DeviceGroupedConvBwdWeight_Wmma_CShuffleV3;
} // namespace ck::tensor_operation::device
namespace ck_tile {
namespace reflect {
/// Reflection traits for DeviceGroupedConvBwdWeight_Wmma_CShuffleV3.
///
/// Every template argument of the device operation is re-exported: type
/// arguments as member aliases carrying the original parameter name, value
/// arguments as `k`-prefixed static constants. instance_string() renders the
/// complete instance as text.
template <ck::index_t NDimSpatial,
          typename InLayout_,
          typename WeiLayout_,
          typename OutLayout_,
          typename InDataType_,
          typename WeiDataType_,
          typename OutDataType_,
          typename AccDataType_,
          typename InElementwiseOperation_,
          typename WeiElementwiseOperation_,
          typename OutElementwiseOperation_,
          ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
              ConvBackwardWeightSpecialization,
          ck::index_t BlockSize,
          ck::index_t MPerBlock,
          ck::index_t NPerBlock,
          ck::index_t KPerBlock,
          ck::index_t ABK1,
          ck::index_t MPerWmma,
          ck::index_t NPerWmma,
          ck::index_t MRepeat,
          ck::index_t NRepeat,
          typename ABlockTransferThreadClusterLengths_AK0_M_AK1_,
          typename ABlockTransferThreadClusterArrangeOrder_,
          typename ABlockTransferSrcAccessOrder_,
          ck::index_t ABlockTransferSrcVectorDim,
          ck::index_t ABlockTransferSrcScalarPerVector,
          ck::index_t ABlockTransferDstScalarPerVector_AK1,
          bool ABlockLdsAddExtraM,
          typename BBlockTransferThreadClusterLengths_BK0_N_BK1_,
          typename BBlockTransferThreadClusterArrangeOrder_,
          typename BBlockTransferSrcAccessOrder_,
          ck::index_t BBlockTransferSrcVectorDim,
          ck::index_t BBlockTransferSrcScalarPerVector,
          ck::index_t BBlockTransferDstScalarPerVector_BK1,
          bool BBlockLdsAddExtraN,
          ck::index_t CShuffleMRepeatPerShuffle,
          ck::index_t CShuffleNRepeatPerShuffle,
          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
          ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
          ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
          ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
          typename ComputeTypeA_,
          typename ComputeTypeB_,
          ck::index_t MaxTransposeTransferSrcScalarPerVector,
          ck::index_t MaxTransposeTransferDstScalarPerVector>
struct InstanceTraits<ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Wmma_CShuffleV3<
    NDimSpatial,
    InLayout_,
    WeiLayout_,
    OutLayout_,
    InDataType_,
    WeiDataType_,
    OutDataType_,
    AccDataType_,
    InElementwiseOperation_,
    WeiElementwiseOperation_,
    OutElementwiseOperation_,
    ConvBackwardWeightSpecialization,
    BlockSize,
    MPerBlock,
    NPerBlock,
    KPerBlock,
    ABK1,
    MPerWmma,
    NPerWmma,
    MRepeat,
    NRepeat,
    ABlockTransferThreadClusterLengths_AK0_M_AK1_,
    ABlockTransferThreadClusterArrangeOrder_,
    ABlockTransferSrcAccessOrder_,
    ABlockTransferSrcVectorDim,
    ABlockTransferSrcScalarPerVector,
    ABlockTransferDstScalarPerVector_AK1,
    ABlockLdsAddExtraM,
    BBlockTransferThreadClusterLengths_BK0_N_BK1_,
    BBlockTransferThreadClusterArrangeOrder_,
    BBlockTransferSrcAccessOrder_,
    BBlockTransferSrcVectorDim,
    BBlockTransferSrcScalarPerVector,
    BBlockTransferDstScalarPerVector_BK1,
    BBlockLdsAddExtraN,
    CShuffleMRepeatPerShuffle,
    CShuffleNRepeatPerShuffle,
    CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
    CShuffleBlockTransferScalarPerVector_NPerBlock,
    BlkGemmPipeSched,
    BlkGemmPipelineVer,
    ComputeTypeA_,
    ComputeTypeB_,
    MaxTransposeTransferSrcScalarPerVector,
    MaxTransposeTransferDstScalarPerVector>>
{
    // Name of the device operation this specialization describes.
    static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3";

    // --- Problem description: dimensionality, layouts, data types, element-wise ops ---
    static constexpr ck::index_t kNDimSpatial = NDimSpatial;
    using InLayout                            = InLayout_;
    using WeiLayout                           = WeiLayout_;
    using OutLayout                           = OutLayout_;
    using InDataType                          = InDataType_;
    using WeiDataType                         = WeiDataType_;
    using OutDataType                         = OutDataType_;
    using AccDataType                         = AccDataType_;
    using InElementwiseOperation              = InElementwiseOperation_;
    using WeiElementwiseOperation             = WeiElementwiseOperation_;
    using OutElementwiseOperation             = OutElementwiseOperation_;
    static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;

    // --- Block / WMMA tile sizes ---
    static constexpr ck::index_t kBlockSize = BlockSize;
    static constexpr ck::index_t kMPerBlock = MPerBlock;
    static constexpr ck::index_t kNPerBlock = NPerBlock;
    static constexpr ck::index_t kKPerBlock = KPerBlock;
    static constexpr ck::index_t kABK1      = ABK1;
    static constexpr ck::index_t kMPerWmma  = MPerWmma;
    static constexpr ck::index_t kNPerWmma  = NPerWmma;
    static constexpr ck::index_t kMRepeat   = MRepeat;
    static constexpr ck::index_t kNRepeat   = NRepeat;

    // --- A block transfer ---
    using ABlockTransferThreadClusterLengths_AK0_M_AK1 =
        ABlockTransferThreadClusterLengths_AK0_M_AK1_;
    using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
    using ABlockTransferSrcAccessOrder            = ABlockTransferSrcAccessOrder_;
    static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
    static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
        ABlockTransferSrcScalarPerVector;
    static constexpr ck::index_t kABlockTransferDstScalarPerVector_AK1 =
        ABlockTransferDstScalarPerVector_AK1;
    static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;

    // --- B block transfer ---
    using BBlockTransferThreadClusterLengths_BK0_N_BK1 =
        BBlockTransferThreadClusterLengths_BK0_N_BK1_;
    using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
    using BBlockTransferSrcAccessOrder            = BBlockTransferSrcAccessOrder_;
    static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
    static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
        BBlockTransferSrcScalarPerVector;
    static constexpr ck::index_t kBBlockTransferDstScalarPerVector_BK1 =
        BBlockTransferDstScalarPerVector_BK1;
    static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;

    // --- C shuffle ---
    static constexpr ck::index_t kCShuffleMRepeatPerShuffle = CShuffleMRepeatPerShuffle;
    static constexpr ck::index_t kCShuffleNRepeatPerShuffle = CShuffleNRepeatPerShuffle;
    using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
    static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_NPerBlock =
        CShuffleBlockTransferScalarPerVector_NPerBlock;

    // --- Pipeline, compute types, transpose-transfer limits ---
    static constexpr ck::BlockGemmPipelineScheduler kBlkGemmPipeSched = BlkGemmPipeSched;
    static constexpr ck::BlockGemmPipelineVersion kBlkGemmPipelineVer = BlkGemmPipelineVer;
    using ComputeTypeA                                                 = ComputeTypeA_;
    using ComputeTypeB                                                 = ComputeTypeB_;
    static constexpr ck::index_t kMaxTransposeTransferSrcScalarPerVector =
        MaxTransposeTransferSrcScalarPerVector;
    static constexpr ck::index_t kMaxTransposeTransferDstScalarPerVector =
        MaxTransposeTransferDstScalarPerVector;

    /// Renders this kernel instance as text: the device-op name followed by
    /// all 45 template arguments, comma-separated inside angle brackets and
    /// emitted in template-declaration order.
    static std::string instance_string()
    {
        // Spell a compile-time flag the way it is written in C++ source.
        const auto flag_text = [](bool flag) { return flag ? "true" : "false"; };

        std::ostringstream text;
        text << kTensorOpName
             << "<" << kNDimSpatial                                           //  1. NDimSpatial
             << "," << detail::layout_name<InLayout>()                        //  2. InLayout
             << "," << detail::layout_name<WeiLayout>()                       //  3. WeiLayout
             << "," << detail::layout_name<OutLayout>()                       //  4. OutLayout
             << "," << detail::type_name<InDataType>()                        //  5. InDataType
             << "," << detail::type_name<WeiDataType>()                       //  6. WeiDataType
             << "," << detail::type_name<OutDataType>()                       //  7. OutDataType
             << "," << detail::type_name<AccDataType>()                       //  8. AccDataType
             << "," << detail::elementwise_op_name<InElementwiseOperation>()  //  9. InElementwiseOperation
             << "," << detail::elementwise_op_name<WeiElementwiseOperation>() // 10. WeiElementwiseOperation
             << "," << detail::elementwise_op_name<OutElementwiseOperation>() // 11. OutElementwiseOperation
             << ","
             << detail::conv_bwd_weight_spec_name(
                    kConvBackwardWeightSpecialization)                        // 12. ConvBackwardWeightSpecialization
             << "," << kBlockSize                                             // 13. BlockSize
             << "," << kMPerBlock                                             // 14. MPerBlock
             << "," << kNPerBlock                                             // 15. NPerBlock
             << "," << kKPerBlock                                             // 16. KPerBlock
             << "," << kABK1                                                  // 17. ABK1
             << "," << kMPerWmma                                              // 18. MPerWmma
             << "," << kNPerWmma                                              // 19. NPerWmma
             << "," << kMRepeat                                               // 20. MRepeat
             << "," << kNRepeat                                               // 21. NRepeat
             << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_AK0_M_AK1>() // 22.
             << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>()      // 23.
             << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>()                 // 24.
             << "," << kABlockTransferSrcVectorDim                            // 25.
             << "," << kABlockTransferSrcScalarPerVector                      // 26.
             << "," << kABlockTransferDstScalarPerVector_AK1                  // 27.
             << "," << flag_text(kABlockLdsAddExtraM)                         // 28.
             << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_BK0_N_BK1>() // 29.
             << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>()      // 30.
             << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>()                 // 31.
             << "," << kBBlockTransferSrcVectorDim                            // 32.
             << "," << kBBlockTransferSrcScalarPerVector                      // 33.
             << "," << kBBlockTransferDstScalarPerVector_BK1                  // 34.
             << "," << flag_text(kBBlockLdsAddExtraN)                         // 35.
             << "," << kCShuffleMRepeatPerShuffle                             // 36.
             << "," << kCShuffleNRepeatPerShuffle                             // 37.
             << ","
             << detail::sequence_name<
                    CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>() // 38.
             << "," << kCShuffleBlockTransferScalarPerVector_NPerBlock        // 39.
             << "," << detail::pipeline_scheduler_name(kBlkGemmPipeSched)     // 40.
             << "," << detail::pipeline_version_name(kBlkGemmPipelineVer)     // 41.
             << "," << detail::type_name<ComputeTypeA>()                      // 42.
             << "," << detail::type_name<ComputeTypeB>()                      // 43.
             << "," << kMaxTransposeTransferSrcScalarPerVector                // 44.
             << "," << kMaxTransposeTransferDstScalarPerVector                // 45.
             << ">";
        return text.str();
    }
};
} // namespace reflect
} // namespace ck_tile

View File

@@ -0,0 +1,284 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "instance_traits.hpp"
#include "instance_traits_util.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
// Forward declaration to avoid circular dependency.
// Only the declaration is needed here: it is what allows InstanceTraits to be
// partially specialized on this template further down in this file.
// The kind and order of the template parameters below must stay in sync with
// the InstanceTraits specialization that names this template.
namespace ck::tensor_operation::device {
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename AccDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t K0PerBlock,
ck::index_t K1,
ck::index_t MPerXDL,
ck::index_t NPerXDL,
ck::index_t MXdlPerWave,
ck::index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_K0_M_K1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_K1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMXdlPerWavePerShuffle,
ck::index_t CShuffleNXdlPerWavePerShuffle,
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
typename ComputeTypeA,
typename ComputeTypeB>
struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3;
} // namespace ck::tensor_operation::device
namespace ck_tile {
namespace reflect {
/// Reflection traits for DeviceGroupedConvBwdWeight_Xdl_CShuffleV3.
///
/// Every template argument of the device operation is re-exported: type
/// arguments as member aliases carrying the original parameter name, value
/// arguments as `k`-prefixed static constants. instance_string() renders the
/// complete instance as text.
template <ck::index_t NDimSpatial,
          typename InLayout_,
          typename WeiLayout_,
          typename OutLayout_,
          typename InDataType_,
          typename WeiDataType_,
          typename OutDataType_,
          typename AccDataType_,
          typename InElementwiseOperation_,
          typename WeiElementwiseOperation_,
          typename OutElementwiseOperation_,
          ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
              ConvBackwardWeightSpecialization,
          ck::index_t BlockSize,
          ck::index_t MPerBlock,
          ck::index_t NPerBlock,
          ck::index_t K0PerBlock,
          ck::index_t K1,
          ck::index_t MPerXDL,
          ck::index_t NPerXDL,
          ck::index_t MXdlPerWave,
          ck::index_t NXdlPerWave,
          typename ABlockTransferThreadClusterLengths_K0_M_K1_,
          typename ABlockTransferThreadClusterArrangeOrder_,
          typename ABlockTransferSrcAccessOrder_,
          ck::index_t ABlockTransferSrcVectorDim,
          ck::index_t ABlockTransferSrcScalarPerVector,
          ck::index_t ABlockTransferDstScalarPerVector_K1,
          bool ABlockLdsAddExtraM,
          typename BBlockTransferThreadClusterLengths_K0_N_K1_,
          typename BBlockTransferThreadClusterArrangeOrder_,
          typename BBlockTransferSrcAccessOrder_,
          ck::index_t BBlockTransferSrcVectorDim,
          ck::index_t BBlockTransferSrcScalarPerVector,
          ck::index_t BBlockTransferDstScalarPerVector_K1,
          bool BBlockLdsAddExtraN,
          ck::index_t CShuffleMXdlPerWavePerShuffle,
          ck::index_t CShuffleNXdlPerWavePerShuffle,
          typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
          ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
          ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
          ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
          typename ComputeTypeA_,
          typename ComputeTypeB_>
struct InstanceTraits<ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<
    NDimSpatial,
    InLayout_,
    WeiLayout_,
    OutLayout_,
    InDataType_,
    WeiDataType_,
    OutDataType_,
    AccDataType_,
    InElementwiseOperation_,
    WeiElementwiseOperation_,
    OutElementwiseOperation_,
    ConvBackwardWeightSpecialization,
    BlockSize,
    MPerBlock,
    NPerBlock,
    K0PerBlock,
    K1,
    MPerXDL,
    NPerXDL,
    MXdlPerWave,
    NXdlPerWave,
    ABlockTransferThreadClusterLengths_K0_M_K1_,
    ABlockTransferThreadClusterArrangeOrder_,
    ABlockTransferSrcAccessOrder_,
    ABlockTransferSrcVectorDim,
    ABlockTransferSrcScalarPerVector,
    ABlockTransferDstScalarPerVector_K1,
    ABlockLdsAddExtraM,
    BBlockTransferThreadClusterLengths_K0_N_K1_,
    BBlockTransferThreadClusterArrangeOrder_,
    BBlockTransferSrcAccessOrder_,
    BBlockTransferSrcVectorDim,
    BBlockTransferSrcScalarPerVector,
    BBlockTransferDstScalarPerVector_K1,
    BBlockLdsAddExtraN,
    CShuffleMXdlPerWavePerShuffle,
    CShuffleNXdlPerWavePerShuffle,
    CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
    CBlockTransferScalarPerVector_NWaveNPerXdl,
    BlkGemmPipeSched,
    BlkGemmPipelineVer,
    ComputeTypeA_,
    ComputeTypeB_>>
{
    // Name of the device operation this specialization describes.
    static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3";

    // --- Problem description: dimensionality, layouts, data types, element-wise ops ---
    static constexpr ck::index_t kNDimSpatial = NDimSpatial;
    using InLayout                            = InLayout_;
    using WeiLayout                           = WeiLayout_;
    using OutLayout                           = OutLayout_;
    using InDataType                          = InDataType_;
    using WeiDataType                         = WeiDataType_;
    using OutDataType                         = OutDataType_;
    using AccDataType                         = AccDataType_;
    using InElementwiseOperation              = InElementwiseOperation_;
    using WeiElementwiseOperation             = WeiElementwiseOperation_;
    using OutElementwiseOperation             = OutElementwiseOperation_;
    static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;

    // --- Block / XDL tile sizes ---
    static constexpr ck::index_t kBlockSize   = BlockSize;
    static constexpr ck::index_t kMPerBlock   = MPerBlock;
    static constexpr ck::index_t kNPerBlock   = NPerBlock;
    static constexpr ck::index_t kK0PerBlock  = K0PerBlock;
    static constexpr ck::index_t kK1          = K1;
    static constexpr ck::index_t kMPerXDL     = MPerXDL;
    static constexpr ck::index_t kNPerXDL     = NPerXDL;
    static constexpr ck::index_t kMXdlPerWave = MXdlPerWave;
    static constexpr ck::index_t kNXdlPerWave = NXdlPerWave;

    // --- A block transfer ---
    using ABlockTransferThreadClusterLengths_K0_M_K1 = ABlockTransferThreadClusterLengths_K0_M_K1_;
    using ABlockTransferThreadClusterArrangeOrder    = ABlockTransferThreadClusterArrangeOrder_;
    using ABlockTransferSrcAccessOrder               = ABlockTransferSrcAccessOrder_;
    static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
    static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
        ABlockTransferSrcScalarPerVector;
    static constexpr ck::index_t kABlockTransferDstScalarPerVector_K1 =
        ABlockTransferDstScalarPerVector_K1;
    static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;

    // --- B block transfer ---
    using BBlockTransferThreadClusterLengths_K0_N_K1 = BBlockTransferThreadClusterLengths_K0_N_K1_;
    using BBlockTransferThreadClusterArrangeOrder    = BBlockTransferThreadClusterArrangeOrder_;
    using BBlockTransferSrcAccessOrder               = BBlockTransferSrcAccessOrder_;
    static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
    static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
        BBlockTransferSrcScalarPerVector;
    static constexpr ck::index_t kBBlockTransferDstScalarPerVector_K1 =
        BBlockTransferDstScalarPerVector_K1;
    static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;

    // --- C shuffle / C block transfer ---
    static constexpr ck::index_t kCShuffleMXdlPerWavePerShuffle = CShuffleMXdlPerWavePerShuffle;
    static constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = CShuffleNXdlPerWavePerShuffle;
    using CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
    static constexpr ck::index_t kCBlockTransferScalarPerVector_NWaveNPerXdl =
        CBlockTransferScalarPerVector_NWaveNPerXdl;

    // --- Pipeline and compute types ---
    static constexpr ck::BlockGemmPipelineScheduler kBlkGemmPipeSched = BlkGemmPipeSched;
    static constexpr ck::BlockGemmPipelineVersion kBlkGemmPipelineVer = BlkGemmPipelineVer;
    using ComputeTypeA                                                 = ComputeTypeA_;
    using ComputeTypeB                                                 = ComputeTypeB_;

    /// Renders this kernel instance as text: the device-op name followed by
    /// all 43 template arguments, comma-separated inside angle brackets and
    /// emitted in template-declaration order.
    static std::string instance_string()
    {
        // Spell a compile-time flag the way it is written in C++ source.
        const auto flag_text = [](bool flag) { return flag ? "true" : "false"; };

        std::ostringstream text;
        text << kTensorOpName
             << "<" << kNDimSpatial                                           //  1. NDimSpatial
             << "," << detail::layout_name<InLayout>()                        //  2. InLayout
             << "," << detail::layout_name<WeiLayout>()                       //  3. WeiLayout
             << "," << detail::layout_name<OutLayout>()                       //  4. OutLayout
             << "," << detail::type_name<InDataType>()                        //  5. InDataType
             << "," << detail::type_name<WeiDataType>()                       //  6. WeiDataType
             << "," << detail::type_name<OutDataType>()                       //  7. OutDataType
             << "," << detail::type_name<AccDataType>()                       //  8. AccDataType
             << "," << detail::elementwise_op_name<InElementwiseOperation>()  //  9. InElementwiseOperation
             << "," << detail::elementwise_op_name<WeiElementwiseOperation>() // 10. WeiElementwiseOperation
             << "," << detail::elementwise_op_name<OutElementwiseOperation>() // 11. OutElementwiseOperation
             << ","
             << detail::conv_bwd_weight_spec_name(
                    kConvBackwardWeightSpecialization)                        // 12. ConvBackwardWeightSpecialization
             << "," << kBlockSize                                             // 13. BlockSize
             << "," << kMPerBlock                                             // 14. MPerBlock
             << "," << kNPerBlock                                             // 15. NPerBlock
             << "," << kK0PerBlock                                            // 16. K0PerBlock
             << "," << kK1                                                    // 17. K1
             << "," << kMPerXDL                                               // 18. MPerXDL
             << "," << kNPerXDL                                               // 19. NPerXDL
             << "," << kMXdlPerWave                                           // 20. MXdlPerWave
             << "," << kNXdlPerWave                                           // 21. NXdlPerWave
             << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_K0_M_K1>() // 22.
             << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>()    // 23.
             << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>()               // 24.
             << "," << kABlockTransferSrcVectorDim                            // 25.
             << "," << kABlockTransferSrcScalarPerVector                      // 26.
             << "," << kABlockTransferDstScalarPerVector_K1                   // 27.
             << "," << flag_text(kABlockLdsAddExtraM)                         // 28.
             << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_K0_N_K1>() // 29.
             << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>()    // 30.
             << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>()               // 31.
             << "," << kBBlockTransferSrcVectorDim                            // 32.
             << "," << kBBlockTransferSrcScalarPerVector                      // 33.
             << "," << kBBlockTransferDstScalarPerVector_K1                   // 34.
             << "," << flag_text(kBBlockLdsAddExtraN)                         // 35.
             << "," << kCShuffleMXdlPerWavePerShuffle                         // 36.
             << "," << kCShuffleNXdlPerWavePerShuffle                         // 37.
             << ","
             << detail::sequence_name<
                    CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>() // 38.
             << "," << kCBlockTransferScalarPerVector_NWaveNPerXdl            // 39.
             << "," << detail::pipeline_scheduler_name(kBlkGemmPipeSched)     // 40.
             << "," << detail::pipeline_version_name(kBlkGemmPipelineVer)     // 41.
             << "," << detail::type_name<ComputeTypeA>()                      // 42.
             << "," << detail::type_name<ComputeTypeB>()                      // 43.
             << ">";
        return text.str();
    }
};
} // namespace reflect
} // namespace ck_tile

View File

@@ -119,13 +119,29 @@ add_ck_builder_test(test_ckb_conv_builder
# - Group convolution (v3, standard, large tensor, WMMA, DL variants)
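# - Backward weight group convolution (XDL, XDL v3, DL, multiple-D XDL, two-stage XDL;
#   plus the WMMA v3 variants when CK_USE_WMMA is enabled)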
# Requires kernel compilation to validate the generated strings through the base class.
add_ck_builder_test(test_ckb_instance_string
set(INSTANCE_STRING_TESTS
test_instance_string_fwd_grp_conv_v3.cpp
test_instance_string_fwd_grp_conv.cpp
test_instance_string_fwd_grp_conv_large_tensor.cpp
test_instance_string_fwd_grp_conv_wmma.cpp
test_instance_string_fwd_grp_conv_dl.cpp
test_instance_string_bwd_weight_grp_conv_xdl.cpp)
test_instance_string_bwd_weight_grp_conv_xdl.cpp
test_instance_string_bwd_weight_grp_conv_dl.cpp
test_instance_string_bwd_weight_grp_conv_multiple_d_xdl.cpp
test_instance_string_bwd_weight_grp_conv_two_stage_xdl.cpp
test_instance_string_bwd_weight_grp_conv_xdl_v3.cpp
)
# The WMMA backward-weight instance-string tests are added to the source list
# only when CK_USE_WMMA is enabled.
if (CK_USE_WMMA)
list(APPEND INSTANCE_STRING_TESTS
test_instance_string_bwd_weight_grp_conv_wmma_v3.cpp
test_instance_string_bwd_weight_grp_conv_multiple_d_wmma_v3.cpp
test_instance_string_bwd_weight_grp_conv_two_stage_wmma_v3.cpp
)
endif()
# Hand the collected sources to add_ck_builder_test under the test name
# test_ckb_instance_string.
add_ck_builder_test(test_ckb_instance_string ${INSTANCE_STRING_TESTS})
# Tests the forward convolution builder across multiple data types and dimensions.
# Individual tests are split into separate files to enable parallel compilation.

View File

@@ -0,0 +1,79 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include "ck_tile/builder/reflect/instance_traits.hpp"
#include "ck_tile/builder/reflect/conv_description.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp"
namespace {

// Use the first instance from device_grouped_conv_bwd_weight_dl_f16_instances
using InstanceTuple =
    ck::tensor_operation::device::instance::device_grouped_conv_bwd_weight_dl_f16_instances<
        2,                                     // NDimSpatial
        ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
        ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
        ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
        ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default>;

using DeviceInstance = std::tuple_element_t<0, InstanceTuple>;

// Expected instance string for that first instance. Each fragment is annotated
// with the template parameter it encodes. Declared const so the reference
// value cannot be modified by accident.
const std::string expected_str = "DeviceGroupedConvBwdWeight_Dl"
                                 "<2"             // NDimSpatial
                                 ",GNHWC"         // InLayout
                                 ",GKYXC"         // WeiLayout
                                 ",GNHWK"         // OutLayout
                                 ",fp16"          // InDataType
                                 ",fp16"          // WeiDataType
                                 ",fp16"          // OutDataType
                                 ",fp32"          // AccDataType
                                 ",PassThrough"   // InElementwiseOperation
                                 ",PassThrough"   // WeiElementwiseOperation
                                 ",PassThrough"   // OutElementwiseOperation
                                 ",Default"       // ConvBackwardWeightSpecialization
                                 ",256"           // BlockSize
                                 ",128"           // MPerBlock
                                 ",128"           // NPerBlock
                                 ",16"            // K0PerBlock
                                 ",1"             // K1
                                 ",4"             // M1PerThread
                                 ",4"             // N1PerThread
                                 ",1"             // KPerThread
                                 ",Seq(8,2)"      // M1N1ThreadClusterM1Xs
                                 ",Seq(8,2)"      // M1N1ThreadClusterN1Xs
                                 ",Seq(1,8,1,1,1)"   // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
                                 ",Seq(1,2,1,128,1)" // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
                                 ",Seq(0,2,3,1,4)"   // ABlockTransferThreadClusterArrangeOrder
                                 ",Seq(0,2,3,1,4)"   // ABlockTransferSrcAccessOrder
                                 ",Seq(1,1,1,1,1)"   // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
                                 ",Seq(0,2,3,1,4)"   // ABlockTransferSrcVectorTensorContiguousDimOrder
                                 ",Seq(1,1,1,1,1)"   // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
                                 ",Seq(1,1,1,8,1)"   // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
                                 ",Seq(1,16,1,16,1)" // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
                                 ",Seq(0,1,4,2,3)"   // BBlockTransferThreadClusterArrangeOrder
                                 ",Seq(0,1,4,2,3)"   // BBlockTransferSrcAccessOrder
                                 ",Seq(1,1,1,1,1)"   // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
                                 ",Seq(0,1,4,2,3)"   // BBlockTransferSrcVectorTensorContiguousDimOrder
                                 ",Seq(1,1,1,1,1)"   // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
                                 ",Seq(0,1,2,3,4,5)" // CThreadTransferSrcDstAccessOrder
                                 ",5"                // CThreadTransferSrcDstVectorDim
                                 ",1"                // CThreadTransferDstScalarPerVector
                                 ">";

// Checks that describe(), called through a BaseOperator pointer, yields a
// description whose instance_string() equals the expected text.
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvDl)
{
    using BaseClass = ck::tensor_operation::device::BaseOperator;

    DeviceInstance device_instance;
    BaseClass* base_ptr = &device_instance;

    auto desc = base_ptr->describe();
    // Stop here if no description was produced; the next line dereferences it.
    ASSERT_NE(desc, nullptr);
    EXPECT_EQ(desc->instance_string(), expected_str);
}

} // namespace

View File

@@ -0,0 +1,86 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include "ck_tile/builder/reflect/instance_traits.hpp"
#include "ck_tile/builder/reflect/conv_description.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_scale_instance.hpp"
namespace {
namespace ckr = ck_tile::reflect;
// Use the first instance from device_grouped_conv_bwd_weight_wmma_c_shuffle_f16_scale_instances
using InstanceTuple = ck::tensor_operation::device::instance::
device_grouped_conv_bwd_weight_wmma_c_shuffle_f16_scale_instances<
2, // NDimSpatial
ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default>;
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Expected instance_string() value, based on the generic instance.
// The adjacent string literals below are concatenated into a single string:
// the first piece is the device operation name, every following piece is one
// comma-separated field, and the trailing comment on each line names the
// template parameter that field stands for. The TEST below compares this
// string against desc->instance_string().
std::string expected_str =
"DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3"
"<2" // NDimSpatial
",GNHWC" // InLayout
",GKYXC" // WeiLayout
",GNHWK" // OutLayout
",EmptyTuple" // DsLayout
",fp16" // InDataType
",fp16" // WeiDataType
",fp16" // OutDataType
",fp32" // AccDataType
",EmptyTuple" // DsDataType
",PassThrough" // InElementwiseOperation
",Scale" // WeiElementwiseOperation
",PassThrough" // OutElementwiseOperation
",Default" // ConvBackwardWeightSpecialization
",64" // BlockSize
",64" // MPerBlock
",64" // NPerBlock
",32" // KPerBlock
",8" // ABK1
",16" // MPerWmma
",16" // NPerWmma
",4" // MRepeat
",2" // NRepeat
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_AK0_M_AK1
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",1" // ABlockTransferSrcVectorDim
",2" // ABlockTransferSrcScalarPerVector
",4" // ABlockTransferDstScalarPerVector_AK1
",true" // ABlockLdsAddExtraM
",Seq(4,8,1)" // BBlockTransferThreadClusterLengths_BK0_N_BK1
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",1" // BBlockTransferSrcVectorDim
",2" // BBlockTransferSrcScalarPerVector
",4" // BBlockTransferDstScalarPerVector_BK1
",true" // BBlockLdsAddExtraN
",1" // CShuffleMRepeatPerShuffle
",1" // CShuffleNRepeatPerShuffle
",Seq(1,16,1,4)" // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
",2" // CShuffleBlockTransferScalarPerVector_NPerBlock
",Intrawave" // BlkGemmPipeSched
",v1" // BlkGemmPipelineVer
",fp16" // ComputeTypeA
",fp16" // ComputeTypeB
">";
// Calls describe() through a BaseOperator pointer to a DeviceInstance and
// checks that the returned description's instance_string() equals expected_str.
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvMultipleDWmmaV3)
{
    DeviceInstance instance_under_test;
    ck::tensor_operation::device::BaseOperator* const as_base = &instance_under_test;

    auto description = as_base->describe();
    // Fatal assertion: stop here rather than dereference a null description.
    ASSERT_NE(description, nullptr);
    EXPECT_EQ(description->instance_string(), expected_str);
}
} // namespace

View File

@@ -0,0 +1,84 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include "ck_tile/builder/reflect/instance_traits.hpp"
#include "ck_tile/builder/reflect/conv_description.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_scale_instance.hpp"
namespace {
namespace ckr = ck_tile::reflect;

// Instance list under test: the
// device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_scale_instances template
// instantiated with the arguments annotated below.
using InstanceTuple =
    ck::tensor_operation::device::instance::
        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_scale_instances<
            2,                                     // NDimSpatial
            ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
            ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
            ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
            ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default>;

// The device operation exercised by the test is element 0 of that list.
using DeviceInstance = std::tuple_element_t<0, InstanceTuple>;
// Expected instance_string() value, based on the generic instance.
// The adjacent string literals below are concatenated into a single string:
// the first piece is the device operation name, every following piece is one
// comma-separated field, and the trailing comment on each line names the
// template parameter that field stands for. The TEST below compares this
// string against desc->instance_string().
std::string expected_str =
"DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle"
"<2" // NDimSpatial
",GNHWC" // InLayout
",GKYXC" // WeiLayout
",GNHWK" // OutLayout
",EmptyTuple" // DsLayout
",fp16" // InDataType
",fp16" // WeiDataType
",fp16" // OutDataType
",fp32" // AccDataType
",EmptyTuple" // DsDataType
",PassThrough" // InElementwiseOperation
",Scale" // WeiElementwiseOperation
",PassThrough" // OutElementwiseOperation
",Default" // ConvBackwardWeightSpecialization
",64" // BlockSize
",64" // MPerBlock
",64" // NPerBlock
",4" // K0PerBlock
",8" // K1
",32" // MPerXDL
",32" // NPerXDL
",2" // MXdlPerWave
",2" // NXdlPerWave
",Seq(1,4,8,2)" // ABlockTransferThreadClusterLengths_K0_M_K1
",Seq(0,3,1,2)" // ABlockTransferThreadClusterArrangeOrder
",Seq(0,2,1,3)" // ABlockTransferSrcAccessOrder
",2" // ABlockTransferSrcVectorDim
",2" // ABlockTransferSrcScalarPerVector
",4" // ABlockTransferDstScalarPerVector_K1
",true" // ABlockLdsAddExtraM
",Seq(1,4,8,2)" // BBlockTransferThreadClusterLengths_K0_N_K1
",Seq(0,3,1,2)" // BBlockTransferThreadClusterArrangeOrder
",Seq(0,2,1,3)" // BBlockTransferSrcAccessOrder
",2" // BBlockTransferSrcVectorDim
",2" // BBlockTransferSrcScalarPerVector
",4" // BBlockTransferDstScalarPerVector_K1
",true" // BBlockLdsAddExtraN
",1" // CShuffleMXdlPerWavePerShuffle
",1" // CShuffleNXdlPerWavePerShuffle
",Seq(1,16,1,4)" // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
",2" // CBlockTransferScalarPerVector_NWaveNPerXdl
",fp16" // ComputeTypeA
",fp16" // ComputeTypeB
">";
// Calls describe() through a BaseOperator pointer to a DeviceInstance and
// checks that the returned description's instance_string() equals expected_str.
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvMultipleDXdl)
{
    DeviceInstance instance_under_test;
    ck::tensor_operation::device::BaseOperator* const as_base = &instance_under_test;

    auto description = as_base->describe();
    // Fatal assertion: stop here rather than dereference a null description.
    ASSERT_NE(description, nullptr);
    EXPECT_EQ(description->instance_string(), expected_str);
}
} // namespace

View File

@@ -0,0 +1,90 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include "ck_tile/builder/reflect/instance_traits.hpp"
#include "ck_tile/builder/reflect/conv_description.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp"
namespace {
namespace ckr = ck_tile::reflect;

// Instance list under test: the
// device_grouped_conv_bwd_weight_two_stage_nhwgc_wmma_c_shuffle_f16_instances
// template instantiated with the arguments annotated below.
using InstanceTuple =
    ck::tensor_operation::device::instance::
        device_grouped_conv_bwd_weight_two_stage_nhwgc_wmma_c_shuffle_f16_instances<
            2,                                     // NDimSpatial
            ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
            ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
            ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
            ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default,
            ck::BlockGemmPipelineScheduler::Intrawave,
            ck::BlockGemmPipelineVersion::v1>;

// The device operation exercised by the test is element 0 of that list.
using DeviceInstance = std::tuple_element_t<0, InstanceTuple>;
// Expected instance_string() value, based on the first instance
// (BlockSize=32, MPerBlock=16, NPerBlock=16, etc.).
// The adjacent string literals below are concatenated into a single string:
// the first piece is the device operation name, every following piece is one
// comma-separated field, and the trailing comment on each line names the
// template parameter that field stands for. The TEST below compares this
// string against desc->instance_string().
std::string expected_str =
"DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3"
"<2" // NDimSpatial
",GNHWC" // InLayout
",GKYXC" // WeiLayout
",GNHWK" // OutLayout
",fp16" // InDataType
",fp16" // WeiDataType
",fp16" // OutDataType
",fp32" // AccDataType
",PassThrough" // InElementwiseOperation
",PassThrough" // WeiElementwiseOperation
",PassThrough" // OutElementwiseOperation
",Default" // ConvBackwardWeightSpecialization
",32" // BlockSize
",16" // MPerBlock
",16" // NPerBlock
",32" // KPerBlock
",8" // ABK1
",16" // MPerWmma
",16" // NPerWmma
",1" // MRepeat
",1" // NRepeat
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_AK0_M_AK1
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",1" // ABlockTransferSrcVectorDim
",1" // ABlockTransferSrcScalarPerVector
",4" // ABlockTransferDstScalarPerVector_AK1
",false" // ABlockLdsAddExtraM
",Seq(4,8,1)" // BBlockTransferThreadClusterLengths_BK0_N_BK1
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",1" // BBlockTransferSrcVectorDim
",1" // BBlockTransferSrcScalarPerVector
",4" // BBlockTransferDstScalarPerVector_BK1
",false" // BBlockLdsAddExtraN
",1" // CShuffleMRepeatPerShuffle
",1" // CShuffleNRepeatPerShuffle
",Seq(1,4,1,8)" // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
",1" // CShuffleBlockTransferScalarPerVector_NPerBlock
",Intrawave" // BlkGemmPipeSched
",v1" // BlkGemmPipelineVer
",1" // NumGroupsToMerge
",fp16" // ComputeTypeA
",fp16" // ComputeTypeB
",1" // TransposeTransferSrcScalarPerVector
",1" // TransposeTransferDstScalarPerVector
">";
// Calls describe() through a BaseOperator pointer to a DeviceInstance and
// checks that the returned description's instance_string() equals expected_str.
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvTwoStageWmmaV3)
{
    DeviceInstance instance_under_test;
    ck::tensor_operation::device::BaseOperator* const as_base = &instance_under_test;

    auto description = as_base->describe();
    // Fatal assertion: stop here rather than dereference a null description.
    ASSERT_NE(description, nullptr);
    EXPECT_EQ(description->instance_string(), expected_str);
}
} // namespace

View File

@@ -0,0 +1,90 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include "ck_tile/builder/reflect/instance_traits.hpp"
#include "ck_tile/builder/reflect/conv_description.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
namespace {
namespace ckr = ck_tile::reflect;

// Instance list under test: the
// device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances
// template instantiated with the arguments annotated below.
using InstanceTuple =
    ck::tensor_operation::device::instance::
        device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances<
            2,                                     // NDimSpatial
            ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
            ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
            ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
            ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default,
            ck::BlockGemmPipelineScheduler::Intrawave,
            ck::BlockGemmPipelineVersion::v1>;

// The device operation exercised by the test is element 0 of that list.
using DeviceInstance = std::tuple_element_t<0, InstanceTuple>;
// Expected instance_string() value, based on the first instance in the tuple.
// The adjacent string literals below are concatenated into a single string:
// the first piece is the device operation name, every following piece is one
// comma-separated field, and the trailing comment on each line names the
// template parameter that field stands for. The TEST below compares this
// string against desc->instance_string().
std::string expected_str =
"DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle"
"<2" // NDimSpatial
",GNHWC" // InLayout
",GKYXC" // WeiLayout
",GNHWK" // OutLayout
",fp16" // InDataType
",fp16" // WeiDataType
",fp16" // OutDataType
",fp32" // AccDataType
",PassThrough" // InElementwiseOperation
",PassThrough" // WeiElementwiseOperation
",PassThrough" // OutElementwiseOperation
",Default" // ConvBackwardWeightSpecialization
",64" // BlockSize
",16" // MPerBlock
",16" // NPerBlock
",32" // KPerBlock
",8" // K1
",16" // MPerXDL
",16" // NPerXDL
",1" // MXdlPerWave
",1" // NXdlPerWave
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_K0_M_K1
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",1" // ABlockTransferSrcVectorDim
",1" // ABlockTransferSrcScalarPerVector
",4" // ABlockTransferDstScalarPerVector_K1
",false" // ABlockLdsAddExtraM
",Seq(4,8,1)" // BBlockTransferThreadClusterLengths_K0_N_K1
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",1" // BBlockTransferSrcVectorDim
",1" // BBlockTransferSrcScalarPerVector
",4" // BBlockTransferDstScalarPerVector_K1
",false" // BBlockLdsAddExtraN
",1" // CShuffleMXdlPerWavePerShuffle
",1" // CShuffleNXdlPerWavePerShuffle
",Seq(1,8,1,8)" // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
",1" // CBlockTransferScalarPerVector_NWaveNPerXdl
",Intrawave" // BlkGemmPipeSched
",v1" // BlkGemmPipelineVer
",1" // NumGroupsToMerge
",fp16" // ComputeTypeA
",fp16" // ComputeTypeB
",1" // TransposeTransferSrcScalarPerVector
",1" // TransposeTransferDstScalarPerVector
">";
// Calls describe() through a BaseOperator pointer to a DeviceInstance and
// checks that the returned description's instance_string() equals expected_str.
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvTwoStageXdl)
{
    DeviceInstance instance_under_test;
    ck::tensor_operation::device::BaseOperator* const as_base = &instance_under_test;

    auto description = as_base->describe();
    // Fatal assertion: stop here rather than dereference a null description.
    ASSERT_NE(description, nullptr);
    EXPECT_EQ(description->instance_string(), expected_str);
}
} // namespace

View File

@@ -0,0 +1,90 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// #ifdef _NOT_DEFINED_
#include <gtest/gtest.h>
#include "ck_tile/builder/reflect/instance_traits.hpp"
#include "ck_tile/builder/reflect/conv_description.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_instance.hpp"
namespace {
namespace ckr = ck_tile::reflect;
using InstanceTuple = ck::tensor_operation::device::instance::
device_grouped_conv_bwd_weight_wmma_c_shuffle_bf16_instances<
2, // NDimSpatial
ck::tensor_operation::device::instance::NHWGC, // InLayout
ck::tensor_operation::device::instance::GKYXC, // WeiLayout
ck::tensor_operation::device::instance::NHWGK, // OutLayout
ck::tensor_operation::device::instance::ConvBwdWeightDefault>;
// Expected complete instance string.
// The adjacent string literals below are concatenated into a single string:
// the first piece is the device operation name, every following piece is one
// comma-separated field, and the trailing comment on each line names the
// template parameter that field stands for. The TEST below compares this
// string against desc->instance_string().
// NOTE(review): InstanceTuple above is instantiated from the *_bf16_instances
// list, yet the data-type fields here (InDataType, WeiDataType, OutDataType,
// ComputeTypeA, ComputeTypeB) read "fp16" — confirm which side is intended.
std::string expected_str = "DeviceGroupedConvBwdWeight_Wmma_CShuffle"
"<2" // NDimSpatial
",NHWGC" // InLayout
",GKYXC" // WeiLayout
",NHWGK" // OutLayout
",fp16" // InDataType
",fp16" // WeiDataType
",fp16" // OutDataType
",fp32" // AccDataType
",PassThrough" // InElementwiseOperation
",PassThrough" // WeiElementwiseOperation
",PassThrough" // OutElementwiseOperation
",Default" // ConvBackwardWeightSpecialization
",64" // BlockSize
",32" // MPerBlock
",32" // NPerBlock
",32" // KPerBlock
",8" // ABK1
",16" // MPerWmma
",16" // NPerWmma
",2" // MRepeat
",1" // NRepeat
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_AK0_M_AK1
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",1" // ABlockTransferSrcVectorDim
",2" // ABlockTransferSrcScalarPerVector
",2" // ABlockTransferDstScalarPerVector_AK1
",false" // ABlockLdsAddExtraM
",Seq(4,16,1)" // BBlockTransferThreadClusterLengths_BK0_N_BK1
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",1" // BBlockTransferSrcVectorDim
",2" // BBlockTransferSrcScalarPerVector
",2" // BBlockTransferDstScalarPerVector_BK1
",false" // BBlockLdsAddExtraN
",1" // CShuffleMRepeatPerShuffle
",1" // CShuffleNRepeatPerShuffle
",Seq(1,8,1,8)" // CShuffleBlockTransferClusterLengths
",2" // CShuffleBlockTransferScalarPerVector_NPerBlock
",Intrawave" // BlkGemmPipeSched
",v1" // BlkGemmPipelineVer
",fp16" // ComputeTypeA
",fp16" // ComputeTypeB
",1" // MaxTransposeTransferSrcScalarPerVector
",1" // MaxTransposeTransferDstScalarPerVector
">";
// The device operation exercised by the test is element 0 of InstanceTuple.
using DeviceInstance = std::tuple_element_t<0, InstanceTuple>;

// Calls describe() through a BaseOperator pointer for the WMMA CShuffle
// device operation (the non-V3 name at the head of expected_str) and checks
// that the returned description's instance_string() equals expected_str.
// The test name carries no "V3" suffix so that it matches the operation under
// test and stays unique within the InstanceString suite.
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvWmma)
{
    using BaseClass = ck::tensor_operation::device::BaseOperator;

    DeviceInstance device_instance;
    BaseClass* base_ptr = &device_instance;

    auto desc = base_ptr->describe();
    // Fatal assertion: stop here rather than dereference a null description.
    ASSERT_NE(desc, nullptr);
    EXPECT_EQ(desc->instance_string(), expected_str);
}
} // namespace
// #endif

View File

@@ -0,0 +1,90 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// #ifdef _NOT_DEFINED_
#include <gtest/gtest.h>
#include "ck_tile/builder/reflect/instance_traits.hpp"
#include "ck_tile/builder/reflect/conv_description.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_wmma_instance.hpp"
namespace {
namespace ckr = ck_tile::reflect;
using InstanceTuple = ck::tensor_operation::device::instance::
device_grouped_conv_bwd_weight_v3_wmma_c_shuffle_f16_instances<
2, // NDimSpatial
ck::tensor_operation::device::instance::NHWGC, // InLayout
ck::tensor_operation::device::instance::GKYXC, // WeiLayout
ck::tensor_operation::device::instance::NHWGK, // OutLayout
ck::tensor_operation::device::instance::ConvBwdWeightDefault>;
// Expected complete instance string.
// The adjacent string literals below are concatenated into a single string:
// the first piece is the device operation name, every following piece is one
// comma-separated field, and the trailing comment on each line names the
// template parameter that field stands for. The TEST below compares this
// string against desc->instance_string().
std::string expected_str = "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3"
"<2" // NDimSpatial
",NHWGC" // InLayout
",GKYXC" // WeiLayout
",NHWGK" // OutLayout
",fp16" // InDataType
",fp16" // WeiDataType
",fp16" // OutDataType
",fp32" // AccDataType
",PassThrough" // InElementwiseOperation
",PassThrough" // WeiElementwiseOperation
",PassThrough" // OutElementwiseOperation
",Default" // ConvBackwardWeightSpecialization
",64" // BlockSize
",32" // MPerBlock
",32" // NPerBlock
",32" // KPerBlock
",8" // ABK1
",16" // MPerWmma
",16" // NPerWmma
",2" // MRepeat
",1" // NRepeat
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_AK0_M_AK1
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",1" // ABlockTransferSrcVectorDim
",2" // ABlockTransferSrcScalarPerVector
",2" // ABlockTransferDstScalarPerVector_AK1
",false" // ABlockLdsAddExtraM
",Seq(4,16,1)" // BBlockTransferThreadClusterLengths_BK0_N_BK1
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",1" // BBlockTransferSrcVectorDim
",2" // BBlockTransferSrcScalarPerVector
",2" // BBlockTransferDstScalarPerVector_BK1
",false" // BBlockLdsAddExtraN
",1" // CShuffleMRepeatPerShuffle
",1" // CShuffleNRepeatPerShuffle
",Seq(1,8,1,8)" // CShuffleBlockTransferClusterLengths
",2" // CShuffleBlockTransferScalarPerVector_NPerBlock
",Intrawave" // BlkGemmPipeSched
",v1" // BlkGemmPipelineVer
",fp16" // ComputeTypeA
",fp16" // ComputeTypeB
",1" // MaxTransposeTransferSrcScalarPerVector
",1" // MaxTransposeTransferDstScalarPerVector
">";
// The device operation exercised by the test is element 0 of InstanceTuple.
using DeviceInstance = std::tuple_element_t<0, InstanceTuple>;

// WMMA V3 variant: calls describe() through a BaseOperator pointer and checks
// that the returned description's instance_string() equals expected_str.
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvWmmaV3)
{
    DeviceInstance instance_under_test;
    ck::tensor_operation::device::BaseOperator* const as_base = &instance_under_test;

    auto description = as_base->describe();
    // Fatal assertion: stop here rather than dereference a null description.
    ASSERT_NE(description, nullptr);
    EXPECT_EQ(description->instance_string(), expected_str);
}
} // namespace
// #endif

View File

@@ -0,0 +1,86 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include "ck_tile/builder/reflect/instance_traits.hpp"
#include "ck_tile/builder/reflect/conv_description.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
namespace {
namespace ckr = ck_tile::reflect;

// Instance list under test: the
// device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances template
// instantiated with the arguments annotated below.
using InstanceTuple =
    ck::tensor_operation::device::instance::
        device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances<
            2,                                                            // NDimSpatial
            ck::tensor_operation::device::instance::GNHWC,                // InLayout
            ck::tensor_operation::device::instance::GKYXC,                // WeiLayout
            ck::tensor_operation::device::instance::GNHWK,                // OutLayout
            ck::tensor_operation::device::instance::ConvBwdWeightDefault, // ConvBwdWeightSpecialization
            ck::BlockGemmPipelineScheduler::Intrawave,                    // BlkGemmPipeSched
            ck::BlockGemmPipelineVersion::v1>;                            // BlkGemmPipelineVer

// The device operation exercised by the test is element 0 of that list.
using DeviceInstance = std::tuple_element_t<0, InstanceTuple>;
// Expected complete instance string, based on the generic instance.
// The adjacent string literals below are concatenated into a single string:
// the first piece is the device operation name, every following piece is one
// comma-separated field, and the trailing comment on each line names the
// template parameter that field stands for. The TEST below compares this
// string against desc->instance_string().
std::string expected_str =
"DeviceGroupedConvBwdWeight_Xdl_CShuffleV3"
"<2" // NDimSpatial
",GNHWC" // InLayout
",GKYXC" // WeiLayout
",GNHWK" // OutLayout
",fp16" // InDataType
",fp16" // WeiDataType
",fp16" // OutDataType
",fp32" // AccDataType
",PassThrough" // InElementwiseOperation
",PassThrough" // WeiElementwiseOperation
",PassThrough" // OutElementwiseOperation
",Default" // ConvBackwardWeightSpecialization
",64" // BlockSize
",32" // MPerBlock
",32" // NPerBlock
",32" // K0PerBlock
",8" // K1
",32" // MPerXDL
",32" // NPerXDL
",1" // MXdlPerWave
",1" // NXdlPerWave
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_K0_M_K1
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",1" // ABlockTransferSrcVectorDim
",2" // ABlockTransferSrcScalarPerVector
",2" // ABlockTransferDstScalarPerVector_K1
",false" // ABlockLdsAddExtraM
",Seq(4,16,1)" // BBlockTransferThreadClusterLengths_K0_N_K1
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",1" // BBlockTransferSrcVectorDim
",2" // BBlockTransferSrcScalarPerVector
",2" // BBlockTransferDstScalarPerVector_K1
",false" // BBlockLdsAddExtraN
",1" // CShuffleMXdlPerWavePerShuffle
",1" // CShuffleNXdlPerWavePerShuffle
",Seq(1,8,1,8)" // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
",2" // CBlockTransferScalarPerVector_NWaveNPerXdl
",Intrawave" // BlkGemmPipeSched
",v1" // BlkGemmPipelineVer
",fp16" // ComputeTypeA
",fp16" // ComputeTypeB
">";
// XDL V3 variant: calls describe() through a BaseOperator pointer and checks
// that the returned description's instance_string() equals expected_str.
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvXdlV3)
{
    DeviceInstance instance_under_test;
    ck::tensor_operation::device::BaseOperator* const as_base = &instance_under_test;

    auto description = as_base->describe();
    // Fatal assertion: stop here rather than dereference a null description.
    ASSERT_NE(description, nullptr);
    EXPECT_EQ(description->instance_string(), expected_str);
}
} // namespace