mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 14:59:17 +00:00
[CK_BUILDER] Instance traits for conv bwd weight algorithms (#3498)
Added instance traits for the following bwd weight conv algorithms: DeviceGroupedConvBwdWeight_Xdl_CShuffleV3, DeviceGroupedConvBwdWeight_Wmma_CShuffleV3, DeviceGroupedConvBwdWeight_Wmma_CShuffle, DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle, DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3, DeviceGroupedConvBwdWeight_Dl, DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle, DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3. Also added unit tests for the instance traits of those bwd weight algorithms that are currently exposed by the narrow CK build for MIOpen. --------- Co-authored-by: Ville Pietilä <>
This commit is contained in:
@@ -0,0 +1,272 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "instance_traits.hpp"
|
||||
#include "instance_traits_util.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
|
||||
|
||||
namespace ck::tensor_operation::device {
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename AccDataType,
|
||||
typename InElementwiseOperation,
|
||||
typename WeiElementwiseOperation,
|
||||
typename OutElementwiseOperation,
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
|
||||
ConvBackwardWeightSpecialization,
|
||||
ck::index_t BlockSize,
|
||||
ck::index_t MPerBlock,
|
||||
ck::index_t NPerBlock,
|
||||
ck::index_t K0PerBlock,
|
||||
ck::index_t K1,
|
||||
ck::index_t M1PerThread,
|
||||
ck::index_t N1PerThread,
|
||||
ck::index_t KPerThread,
|
||||
typename M1N1ThreadClusterM1Xs,
|
||||
typename M1N1ThreadClusterN1Xs,
|
||||
typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
|
||||
typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
|
||||
typename ABlockTransferThreadClusterArrangeOrder,
|
||||
typename ABlockTransferSrcAccessOrder,
|
||||
typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
|
||||
typename ABlockTransferSrcVectorTensorContiguousDimOrder,
|
||||
typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
|
||||
typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
|
||||
typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
|
||||
typename BBlockTransferThreadClusterArrangeOrder,
|
||||
typename BBlockTransferSrcAccessOrder,
|
||||
typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
|
||||
typename BBlockTransferSrcVectorTensorContiguousDimOrder,
|
||||
typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
|
||||
typename CThreadTransferSrcDstAccessOrder,
|
||||
ck::index_t CThreadTransferSrcDstVectorDim,
|
||||
ck::index_t CThreadTransferDstScalarPerVector>
|
||||
struct DeviceGroupedConvBwdWeight_Dl;
|
||||
|
||||
} // namespace ck::tensor_operation::device
|
||||
|
||||
namespace ck_tile {
|
||||
namespace reflect {
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout_,
|
||||
typename WeiLayout_,
|
||||
typename OutLayout_,
|
||||
typename InDataType_,
|
||||
typename WeiDataType_,
|
||||
typename OutDataType_,
|
||||
typename AccDataType_,
|
||||
typename InElementwiseOperation_,
|
||||
typename WeiElementwiseOperation_,
|
||||
typename OutElementwiseOperation_,
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
|
||||
ConvBackwardWeightSpecialization,
|
||||
ck::index_t BlockSize,
|
||||
ck::index_t MPerBlock,
|
||||
ck::index_t NPerBlock,
|
||||
ck::index_t K0PerBlock,
|
||||
ck::index_t K1,
|
||||
ck::index_t M1PerThread,
|
||||
ck::index_t N1PerThread,
|
||||
ck::index_t KPerThread,
|
||||
typename M1N1ThreadClusterM1Xs_,
|
||||
typename M1N1ThreadClusterN1Xs_,
|
||||
typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1_,
|
||||
typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1_,
|
||||
typename ABlockTransferThreadClusterArrangeOrder_,
|
||||
typename ABlockTransferSrcAccessOrder_,
|
||||
typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1_,
|
||||
typename ABlockTransferSrcVectorTensorContiguousDimOrder_,
|
||||
typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1_,
|
||||
typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1_,
|
||||
typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1_,
|
||||
typename BBlockTransferThreadClusterArrangeOrder_,
|
||||
typename BBlockTransferSrcAccessOrder_,
|
||||
typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1_,
|
||||
typename BBlockTransferSrcVectorTensorContiguousDimOrder_,
|
||||
typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1_,
|
||||
typename CThreadTransferSrcDstAccessOrder_,
|
||||
ck::index_t CThreadTransferSrcDstVectorDim,
|
||||
ck::index_t CThreadTransferDstScalarPerVector>
|
||||
struct InstanceTraits<ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Dl<
|
||||
NDimSpatial,
|
||||
InLayout_,
|
||||
WeiLayout_,
|
||||
OutLayout_,
|
||||
InDataType_,
|
||||
WeiDataType_,
|
||||
OutDataType_,
|
||||
AccDataType_,
|
||||
InElementwiseOperation_,
|
||||
WeiElementwiseOperation_,
|
||||
OutElementwiseOperation_,
|
||||
ConvBackwardWeightSpecialization,
|
||||
BlockSize,
|
||||
MPerBlock,
|
||||
NPerBlock,
|
||||
K0PerBlock,
|
||||
K1,
|
||||
M1PerThread,
|
||||
N1PerThread,
|
||||
KPerThread,
|
||||
M1N1ThreadClusterM1Xs_,
|
||||
M1N1ThreadClusterN1Xs_,
|
||||
ABlockTransferThreadSliceLengths_K0_M0_M1_K1_,
|
||||
ABlockTransferThreadClusterLengths_K0_M0_M1_K1_,
|
||||
ABlockTransferThreadClusterArrangeOrder_,
|
||||
ABlockTransferSrcAccessOrder_,
|
||||
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1_,
|
||||
ABlockTransferSrcVectorTensorContiguousDimOrder_,
|
||||
ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1_,
|
||||
BBlockTransferThreadSliceLengths_K0_N0_N1_K1_,
|
||||
BBlockTransferThreadClusterLengths_K0_N0_N1_K1_,
|
||||
BBlockTransferThreadClusterArrangeOrder_,
|
||||
BBlockTransferSrcAccessOrder_,
|
||||
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1_,
|
||||
BBlockTransferSrcVectorTensorContiguousDimOrder_,
|
||||
BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1_,
|
||||
CThreadTransferSrcDstAccessOrder_,
|
||||
CThreadTransferSrcDstVectorDim,
|
||||
CThreadTransferDstScalarPerVector>>
|
||||
{
|
||||
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeight_Dl";
|
||||
|
||||
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
|
||||
|
||||
using InLayout = InLayout_;
|
||||
using WeiLayout = WeiLayout_;
|
||||
using OutLayout = OutLayout_;
|
||||
|
||||
using InDataType = InDataType_;
|
||||
using WeiDataType = WeiDataType_;
|
||||
using OutDataType = OutDataType_;
|
||||
using AccDataType = AccDataType_;
|
||||
|
||||
using InElementwiseOperation = InElementwiseOperation_;
|
||||
using WeiElementwiseOperation = WeiElementwiseOperation_;
|
||||
using OutElementwiseOperation = OutElementwiseOperation_;
|
||||
|
||||
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
|
||||
|
||||
static constexpr ck::index_t kBlockSize = BlockSize;
|
||||
static constexpr ck::index_t kMPerBlock = MPerBlock;
|
||||
static constexpr ck::index_t kNPerBlock = NPerBlock;
|
||||
static constexpr ck::index_t kK0PerBlock = K0PerBlock;
|
||||
static constexpr ck::index_t kK1 = K1;
|
||||
static constexpr ck::index_t kM1PerThread = M1PerThread;
|
||||
static constexpr ck::index_t kN1PerThread = N1PerThread;
|
||||
static constexpr ck::index_t kKPerThread = KPerThread;
|
||||
|
||||
using M1N1ThreadClusterM1Xs = M1N1ThreadClusterM1Xs_;
|
||||
using M1N1ThreadClusterN1Xs = M1N1ThreadClusterN1Xs_;
|
||||
|
||||
using ABlockTransferThreadSliceLengths_K0_M0_M1_K1 =
|
||||
ABlockTransferThreadSliceLengths_K0_M0_M1_K1_;
|
||||
using ABlockTransferThreadClusterLengths_K0_M0_M1_K1 =
|
||||
ABlockTransferThreadClusterLengths_K0_M0_M1_K1_;
|
||||
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
|
||||
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
|
||||
using ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 =
|
||||
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1_;
|
||||
using ABlockTransferSrcVectorTensorContiguousDimOrder =
|
||||
ABlockTransferSrcVectorTensorContiguousDimOrder_;
|
||||
using ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 =
|
||||
ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1_;
|
||||
|
||||
using BBlockTransferThreadSliceLengths_K0_N0_N1_K1 =
|
||||
BBlockTransferThreadSliceLengths_K0_N0_N1_K1_;
|
||||
using BBlockTransferThreadClusterLengths_K0_N0_N1_K1 =
|
||||
BBlockTransferThreadClusterLengths_K0_N0_N1_K1_;
|
||||
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
|
||||
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
|
||||
using BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 =
|
||||
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1_;
|
||||
using BBlockTransferSrcVectorTensorContiguousDimOrder =
|
||||
BBlockTransferSrcVectorTensorContiguousDimOrder_;
|
||||
using BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 =
|
||||
BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1_;
|
||||
|
||||
using CThreadTransferSrcDstAccessOrder = CThreadTransferSrcDstAccessOrder_;
|
||||
static constexpr ck::index_t kCThreadTransferSrcDstVectorDim = CThreadTransferSrcDstVectorDim;
|
||||
static constexpr ck::index_t kCThreadTransferDstScalarPerVector =
|
||||
CThreadTransferDstScalarPerVector;
|
||||
|
||||
// Static member function to generate instance string
|
||||
static std::string instance_string()
|
||||
{
|
||||
std::ostringstream oss;
|
||||
|
||||
// Kernel type name
|
||||
oss << "DeviceGroupedConvBwdWeight_Dl";
|
||||
|
||||
// Template parameters in exact order
|
||||
oss << "<" << kNDimSpatial; // 1. NDimSpatial
|
||||
oss << "," << detail::layout_name<InLayout>(); // 2. InLayout
|
||||
oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
|
||||
oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
|
||||
oss << "," << detail::type_name<InDataType>(); // 5. InDataType
|
||||
oss << "," << detail::type_name<WeiDataType>(); // 6. WeiDataType
|
||||
oss << "," << detail::type_name<OutDataType>(); // 7. OutDataType
|
||||
oss << "," << detail::type_name<AccDataType>(); // 8. AccDataType
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<InElementwiseOperation>(); // 9. InElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<WeiElementwiseOperation>(); // 10.
|
||||
// WeiElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<OutElementwiseOperation>(); // 11.
|
||||
// OutElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::conv_bwd_weight_spec_name(
|
||||
kConvBackwardWeightSpecialization); // 12. ConvBackwardWeightSpecialization
|
||||
oss << "," << kBlockSize; // 13. BlockSize
|
||||
oss << "," << kMPerBlock; // 14. MPerBlock
|
||||
oss << "," << kNPerBlock; // 15. NPerBlock
|
||||
oss << "," << kK0PerBlock; // 16. K0PerBlock
|
||||
oss << "," << kK1; // 17. K1
|
||||
oss << "," << kM1PerThread; // 18. M1PerThread
|
||||
oss << "," << kN1PerThread; // 19. N1PerThread
|
||||
oss << "," << kKPerThread; // 20. KPerThread
|
||||
oss << "," << detail::sequence_name<M1N1ThreadClusterM1Xs>(); // 21.
|
||||
oss << "," << detail::sequence_name<M1N1ThreadClusterN1Xs>(); // 22.
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadSliceLengths_K0_M0_M1_K1>(); // 23.
|
||||
oss << ","
|
||||
<< detail::sequence_name<ABlockTransferThreadClusterLengths_K0_M0_M1_K1>(); // 24.
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>(); // 25.
|
||||
oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>(); // 26.
|
||||
oss << ","
|
||||
<< detail::sequence_name<ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1>(); // 27.
|
||||
oss << ","
|
||||
<< detail::sequence_name<ABlockTransferSrcVectorTensorContiguousDimOrder>(); // 28.
|
||||
oss << ","
|
||||
<< detail::sequence_name<ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1>(); // 29.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadSliceLengths_K0_N0_N1_K1>(); // 30.
|
||||
oss << ","
|
||||
<< detail::sequence_name<BBlockTransferThreadClusterLengths_K0_N0_N1_K1>(); // 31.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>(); // 32.
|
||||
oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>(); // 33.
|
||||
oss << ","
|
||||
<< detail::sequence_name<BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1>(); // 34.
|
||||
oss << ","
|
||||
<< detail::sequence_name<BBlockTransferSrcVectorTensorContiguousDimOrder>(); // 35.
|
||||
oss << ","
|
||||
<< detail::sequence_name<BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1>(); // 36.
|
||||
oss << "," << detail::sequence_name<CThreadTransferSrcDstAccessOrder>(); // 37.
|
||||
oss << "," << kCThreadTransferSrcDstVectorDim; // 38.
|
||||
oss << "," << kCThreadTransferDstScalarPerVector; // 39.
|
||||
oss << ">";
|
||||
|
||||
return oss.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace reflect
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,295 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "instance_traits.hpp"
|
||||
#include "instance_traits_util.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
|
||||
|
||||
namespace ck::tensor_operation::device {
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout,
|
||||
typename DsLayout,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename AccDataType,
|
||||
typename DsDataType,
|
||||
typename InElementwiseOperation,
|
||||
typename WeiElementwiseOperation,
|
||||
typename OutElementwiseOperation,
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
|
||||
ConvBackwardWeightSpecialization,
|
||||
ck::index_t BlockSize,
|
||||
ck::index_t MPerBlock,
|
||||
ck::index_t NPerBlock,
|
||||
ck::index_t KPerBlock,
|
||||
ck::index_t ABK1,
|
||||
ck::index_t MPerWmma,
|
||||
ck::index_t NPerWmma,
|
||||
ck::index_t MRepeat,
|
||||
ck::index_t NRepeat,
|
||||
typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
|
||||
typename ABlockTransferThreadClusterArrangeOrder,
|
||||
typename ABlockTransferSrcAccessOrder,
|
||||
ck::index_t ABlockTransferSrcVectorDim,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_AK1,
|
||||
bool ABlockLdsAddExtraM,
|
||||
typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
|
||||
typename BBlockTransferThreadClusterArrangeOrder,
|
||||
typename BBlockTransferSrcAccessOrder,
|
||||
ck::index_t BBlockTransferSrcVectorDim,
|
||||
ck::index_t BBlockTransferSrcScalarPerVector,
|
||||
ck::index_t BBlockTransferDstScalarPerVector_BK1,
|
||||
bool BBlockLdsAddExtraN,
|
||||
ck::index_t CShuffleMRepeatPerShuffle,
|
||||
ck::index_t CShuffleNRepeatPerShuffle,
|
||||
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
|
||||
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
|
||||
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
|
||||
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
|
||||
typename ComputeTypeA,
|
||||
typename ComputeTypeB>
|
||||
struct DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3;
|
||||
|
||||
} // namespace ck::tensor_operation::device
|
||||
|
||||
namespace ck_tile {
|
||||
namespace reflect {
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout_,
|
||||
typename WeiLayout_,
|
||||
typename OutLayout_,
|
||||
typename DsLayout_,
|
||||
typename InDataType_,
|
||||
typename WeiDataType_,
|
||||
typename OutDataType_,
|
||||
typename AccDataType_,
|
||||
typename DsDataType_,
|
||||
typename InElementwiseOperation_,
|
||||
typename WeiElementwiseOperation_,
|
||||
typename OutElementwiseOperation_,
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
|
||||
ConvBackwardWeightSpecialization,
|
||||
ck::index_t BlockSize,
|
||||
ck::index_t MPerBlock,
|
||||
ck::index_t NPerBlock,
|
||||
ck::index_t KPerBlock,
|
||||
ck::index_t ABK1,
|
||||
ck::index_t MPerWmma,
|
||||
ck::index_t NPerWmma,
|
||||
ck::index_t MRepeat,
|
||||
ck::index_t NRepeat,
|
||||
typename ABlockTransferThreadClusterLengths_AK0_M_AK1_,
|
||||
typename ABlockTransferThreadClusterArrangeOrder_,
|
||||
typename ABlockTransferSrcAccessOrder_,
|
||||
ck::index_t ABlockTransferSrcVectorDim,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_AK1,
|
||||
bool ABlockLdsAddExtraM,
|
||||
typename BBlockTransferThreadClusterLengths_BK0_N_BK1_,
|
||||
typename BBlockTransferThreadClusterArrangeOrder_,
|
||||
typename BBlockTransferSrcAccessOrder_,
|
||||
ck::index_t BBlockTransferSrcVectorDim,
|
||||
ck::index_t BBlockTransferSrcScalarPerVector,
|
||||
ck::index_t BBlockTransferDstScalarPerVector_BK1,
|
||||
bool BBlockLdsAddExtraN,
|
||||
ck::index_t CShuffleMRepeatPerShuffle,
|
||||
ck::index_t CShuffleNRepeatPerShuffle,
|
||||
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
|
||||
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
|
||||
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
|
||||
typename ComputeTypeA_,
|
||||
typename ComputeTypeB_>
|
||||
struct InstanceTraits<
|
||||
ck::tensor_operation::device::DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3<
|
||||
NDimSpatial,
|
||||
InLayout_,
|
||||
WeiLayout_,
|
||||
OutLayout_,
|
||||
DsLayout_,
|
||||
InDataType_,
|
||||
WeiDataType_,
|
||||
OutDataType_,
|
||||
AccDataType_,
|
||||
DsDataType_,
|
||||
InElementwiseOperation_,
|
||||
WeiElementwiseOperation_,
|
||||
OutElementwiseOperation_,
|
||||
ConvBackwardWeightSpecialization,
|
||||
BlockSize,
|
||||
MPerBlock,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
ABK1,
|
||||
MPerWmma,
|
||||
NPerWmma,
|
||||
MRepeat,
|
||||
NRepeat,
|
||||
ABlockTransferThreadClusterLengths_AK0_M_AK1_,
|
||||
ABlockTransferThreadClusterArrangeOrder_,
|
||||
ABlockTransferSrcAccessOrder_,
|
||||
ABlockTransferSrcVectorDim,
|
||||
ABlockTransferSrcScalarPerVector,
|
||||
ABlockTransferDstScalarPerVector_AK1,
|
||||
ABlockLdsAddExtraM,
|
||||
BBlockTransferThreadClusterLengths_BK0_N_BK1_,
|
||||
BBlockTransferThreadClusterArrangeOrder_,
|
||||
BBlockTransferSrcAccessOrder_,
|
||||
BBlockTransferSrcVectorDim,
|
||||
BBlockTransferSrcScalarPerVector,
|
||||
BBlockTransferDstScalarPerVector_BK1,
|
||||
BBlockLdsAddExtraN,
|
||||
CShuffleMRepeatPerShuffle,
|
||||
CShuffleNRepeatPerShuffle,
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
CShuffleBlockTransferScalarPerVector_NPerBlock,
|
||||
BlkGemmPipeSched,
|
||||
BlkGemmPipelineVer,
|
||||
ComputeTypeA_,
|
||||
ComputeTypeB_>>
|
||||
{
|
||||
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3";
|
||||
|
||||
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
|
||||
|
||||
using InLayout = InLayout_;
|
||||
using WeiLayout = WeiLayout_;
|
||||
using OutLayout = OutLayout_;
|
||||
using DsLayout = DsLayout_;
|
||||
|
||||
using InDataType = InDataType_;
|
||||
using WeiDataType = WeiDataType_;
|
||||
using OutDataType = OutDataType_;
|
||||
using AccDataType = AccDataType_;
|
||||
using DsDataType = DsDataType_;
|
||||
|
||||
using InElementwiseOperation = InElementwiseOperation_;
|
||||
using WeiElementwiseOperation = WeiElementwiseOperation_;
|
||||
using OutElementwiseOperation = OutElementwiseOperation_;
|
||||
|
||||
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
|
||||
|
||||
static constexpr ck::index_t kBlockSize = BlockSize;
|
||||
static constexpr ck::index_t kMPerBlock = MPerBlock;
|
||||
static constexpr ck::index_t kNPerBlock = NPerBlock;
|
||||
static constexpr ck::index_t kKPerBlock = KPerBlock;
|
||||
static constexpr ck::index_t kABK1 = ABK1;
|
||||
static constexpr ck::index_t kMPerWmma = MPerWmma;
|
||||
static constexpr ck::index_t kNPerWmma = NPerWmma;
|
||||
static constexpr ck::index_t kMRepeat = MRepeat;
|
||||
static constexpr ck::index_t kNRepeat = NRepeat;
|
||||
static constexpr ck::index_t kCShuffleMRepeatPerShuffle = CShuffleMRepeatPerShuffle;
|
||||
static constexpr ck::index_t kCShuffleNRepeatPerShuffle = CShuffleNRepeatPerShuffle;
|
||||
static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_NPerBlock =
|
||||
CShuffleBlockTransferScalarPerVector_NPerBlock;
|
||||
|
||||
using ABlockTransferThreadClusterLengths_AK0_M_AK1 =
|
||||
ABlockTransferThreadClusterLengths_AK0_M_AK1_;
|
||||
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
|
||||
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
|
||||
ABlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kABlockTransferDstScalarPerVector_AK1 =
|
||||
ABlockTransferDstScalarPerVector_AK1;
|
||||
static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
|
||||
|
||||
using BBlockTransferThreadClusterLengths_BK0_N_BK1 =
|
||||
BBlockTransferThreadClusterLengths_BK0_N_BK1_;
|
||||
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
|
||||
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
|
||||
BBlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kBBlockTransferDstScalarPerVector_BK1 =
|
||||
BBlockTransferDstScalarPerVector_BK1;
|
||||
static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
|
||||
|
||||
using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
|
||||
|
||||
static constexpr ck::BlockGemmPipelineScheduler kBlkGemmPipeSched = BlkGemmPipeSched;
|
||||
static constexpr ck::BlockGemmPipelineVersion kBlkGemmPipelineVer = BlkGemmPipelineVer;
|
||||
|
||||
using ComputeTypeA = ComputeTypeA_;
|
||||
using ComputeTypeB = ComputeTypeB_;
|
||||
|
||||
// Static member function to generate instance string
|
||||
static std::string instance_string()
|
||||
{
|
||||
std::ostringstream oss;
|
||||
|
||||
// Kernel type name
|
||||
oss << "DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3";
|
||||
|
||||
// Template parameters in exact order
|
||||
oss << "<" << kNDimSpatial; // 1. NDimSpatial
|
||||
oss << "," << detail::layout_name<InLayout>(); // 2. InLayout
|
||||
oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
|
||||
oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
|
||||
oss << "," << detail::tuple_name<DsLayout>(); // 5. DsLayout
|
||||
oss << "," << detail::type_name<InDataType>(); // 6. InDataType
|
||||
oss << "," << detail::type_name<WeiDataType>(); // 7. WeiDataType
|
||||
oss << "," << detail::type_name<OutDataType>(); // 8. OutDataType
|
||||
oss << "," << detail::type_name<AccDataType>(); // 9. AccDataType
|
||||
oss << "," << detail::tuple_name<DsDataType>(); // 10. DsDataType
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<InElementwiseOperation>(); // 11. InElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<WeiElementwiseOperation>(); // 12.
|
||||
// WeiElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<OutElementwiseOperation>(); // 13.
|
||||
// OutElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::conv_bwd_weight_spec_name(
|
||||
kConvBackwardWeightSpecialization); // 14. ConvBackwardWeightSpecialization
|
||||
oss << "," << kBlockSize; // 15. BlockSize
|
||||
oss << "," << kMPerBlock; // 16. MPerBlock
|
||||
oss << "," << kNPerBlock; // 17. NPerBlock
|
||||
oss << "," << kKPerBlock; // 18. KPerBlock
|
||||
oss << "," << kABK1; // 19. ABK1
|
||||
oss << "," << kMPerWmma; // 20. MPerWmma
|
||||
oss << "," << kNPerWmma; // 21. NPerWmma
|
||||
oss << "," << kMRepeat; // 22. MRepeat
|
||||
oss << "," << kNRepeat; // 23. NRepeat
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_AK0_M_AK1>(); // 24.
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>(); // 25.
|
||||
oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>(); // 26.
|
||||
oss << "," << kABlockTransferSrcVectorDim; // 27.
|
||||
oss << "," << kABlockTransferSrcScalarPerVector; // 28.
|
||||
oss << "," << kABlockTransferDstScalarPerVector_AK1; // 29.
|
||||
oss << "," << (kABlockLdsAddExtraM ? "true" : "false"); // 30.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_BK0_N_BK1>(); // 31.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>(); // 32.
|
||||
oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>(); // 33.
|
||||
oss << "," << kBBlockTransferSrcVectorDim; // 34.
|
||||
oss << "," << kBBlockTransferSrcScalarPerVector; // 35.
|
||||
oss << "," << kBBlockTransferDstScalarPerVector_BK1; // 36.
|
||||
oss << "," << (kBBlockLdsAddExtraN ? "true" : "false"); // 37.
|
||||
oss << "," << kCShuffleMRepeatPerShuffle; // 38.
|
||||
oss << "," << kCShuffleNRepeatPerShuffle; // 39.
|
||||
oss << ","
|
||||
<< detail::sequence_name<
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 40.
|
||||
oss << "," << kCShuffleBlockTransferScalarPerVector_NPerBlock; // 41.
|
||||
oss << "," << detail::pipeline_scheduler_name(kBlkGemmPipeSched); // 42.
|
||||
oss << "," << detail::pipeline_version_name(kBlkGemmPipelineVer); // 43.
|
||||
oss << "," << detail::type_name<ComputeTypeA>(); // 44.
|
||||
oss << "," << detail::type_name<ComputeTypeB>(); // 45.
|
||||
oss << ">";
|
||||
|
||||
return oss.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace reflect
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,282 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "instance_traits.hpp"
|
||||
#include "instance_traits_util.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
|
||||
|
||||
namespace ck::tensor_operation::device {
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout,
|
||||
typename DsLayout,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename AccDataType,
|
||||
typename DsDataType,
|
||||
typename InElementwiseOperation,
|
||||
typename WeiElementwiseOperation,
|
||||
typename OutElementwiseOperation,
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
|
||||
ConvBackwardWeightSpecialization,
|
||||
ck::index_t BlockSize,
|
||||
ck::index_t MPerBlock,
|
||||
ck::index_t NPerBlock,
|
||||
ck::index_t K0PerBlock,
|
||||
ck::index_t K1,
|
||||
ck::index_t MPerXDL,
|
||||
ck::index_t NPerXDL,
|
||||
ck::index_t MXdlPerWave,
|
||||
ck::index_t NXdlPerWave,
|
||||
typename ABlockTransferThreadClusterLengths_K0_M_K1,
|
||||
typename ABlockTransferThreadClusterArrangeOrder,
|
||||
typename ABlockTransferSrcAccessOrder,
|
||||
ck::index_t ABlockTransferSrcVectorDim,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_K1,
|
||||
bool ABlockLdsAddExtraM,
|
||||
typename BBlockTransferThreadClusterLengths_K0_N_K1,
|
||||
typename BBlockTransferThreadClusterArrangeOrder,
|
||||
typename BBlockTransferSrcAccessOrder,
|
||||
ck::index_t BBlockTransferSrcVectorDim,
|
||||
ck::index_t BBlockTransferSrcScalarPerVector,
|
||||
ck::index_t BBlockTransferDstScalarPerVector_K1,
|
||||
bool BBlockLdsAddExtraN,
|
||||
ck::index_t CShuffleMXdlPerWavePerShuffle,
|
||||
ck::index_t CShuffleNXdlPerWavePerShuffle,
|
||||
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
|
||||
ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
|
||||
typename ComputeTypeA,
|
||||
typename ComputeTypeB>
|
||||
struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle;
|
||||
|
||||
} // namespace ck::tensor_operation::device
|
||||
|
||||
namespace ck_tile {
|
||||
namespace reflect {
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout_,
|
||||
typename WeiLayout_,
|
||||
typename OutLayout_,
|
||||
typename DsLayout_,
|
||||
typename InDataType_,
|
||||
typename WeiDataType_,
|
||||
typename OutDataType_,
|
||||
typename AccDataType_,
|
||||
typename DsDataType_,
|
||||
typename InElementwiseOperation_,
|
||||
typename WeiElementwiseOperation_,
|
||||
typename OutElementwiseOperation_,
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
|
||||
ConvBackwardWeightSpecialization,
|
||||
ck::index_t BlockSize,
|
||||
ck::index_t MPerBlock,
|
||||
ck::index_t NPerBlock,
|
||||
ck::index_t K0PerBlock,
|
||||
ck::index_t K1,
|
||||
ck::index_t MPerXDL,
|
||||
ck::index_t NPerXDL,
|
||||
ck::index_t MXdlPerWave,
|
||||
ck::index_t NXdlPerWave,
|
||||
typename ABlockTransferThreadClusterLengths_K0_M_K1_,
|
||||
typename ABlockTransferThreadClusterArrangeOrder_,
|
||||
typename ABlockTransferSrcAccessOrder_,
|
||||
ck::index_t ABlockTransferSrcVectorDim,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_K1,
|
||||
bool ABlockLdsAddExtraM,
|
||||
typename BBlockTransferThreadClusterLengths_K0_N_K1_,
|
||||
typename BBlockTransferThreadClusterArrangeOrder_,
|
||||
typename BBlockTransferSrcAccessOrder_,
|
||||
ck::index_t BBlockTransferSrcVectorDim,
|
||||
ck::index_t BBlockTransferSrcScalarPerVector,
|
||||
ck::index_t BBlockTransferDstScalarPerVector_K1,
|
||||
bool BBlockLdsAddExtraN,
|
||||
ck::index_t CShuffleMXdlPerWavePerShuffle,
|
||||
ck::index_t CShuffleNXdlPerWavePerShuffle,
|
||||
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
|
||||
typename ComputeTypeA_,
|
||||
typename ComputeTypeB_>
|
||||
struct InstanceTraits<
|
||||
ck::tensor_operation::device::DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout_,
|
||||
WeiLayout_,
|
||||
OutLayout_,
|
||||
DsLayout_,
|
||||
InDataType_,
|
||||
WeiDataType_,
|
||||
OutDataType_,
|
||||
AccDataType_,
|
||||
DsDataType_,
|
||||
InElementwiseOperation_,
|
||||
WeiElementwiseOperation_,
|
||||
OutElementwiseOperation_,
|
||||
ConvBackwardWeightSpecialization,
|
||||
BlockSize,
|
||||
MPerBlock,
|
||||
NPerBlock,
|
||||
K0PerBlock,
|
||||
K1,
|
||||
MPerXDL,
|
||||
NPerXDL,
|
||||
MXdlPerWave,
|
||||
NXdlPerWave,
|
||||
ABlockTransferThreadClusterLengths_K0_M_K1_,
|
||||
ABlockTransferThreadClusterArrangeOrder_,
|
||||
ABlockTransferSrcAccessOrder_,
|
||||
ABlockTransferSrcVectorDim,
|
||||
ABlockTransferSrcScalarPerVector,
|
||||
ABlockTransferDstScalarPerVector_K1,
|
||||
ABlockLdsAddExtraM,
|
||||
BBlockTransferThreadClusterLengths_K0_N_K1_,
|
||||
BBlockTransferThreadClusterArrangeOrder_,
|
||||
BBlockTransferSrcAccessOrder_,
|
||||
BBlockTransferSrcVectorDim,
|
||||
BBlockTransferSrcScalarPerVector,
|
||||
BBlockTransferDstScalarPerVector_K1,
|
||||
BBlockLdsAddExtraN,
|
||||
CShuffleMXdlPerWavePerShuffle,
|
||||
CShuffleNXdlPerWavePerShuffle,
|
||||
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
CBlockTransferScalarPerVector_NWaveNPerXdl,
|
||||
ComputeTypeA_,
|
||||
ComputeTypeB_>>
|
||||
{
|
||||
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle";
|
||||
|
||||
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
|
||||
|
||||
using InLayout = InLayout_;
|
||||
using WeiLayout = WeiLayout_;
|
||||
using OutLayout = OutLayout_;
|
||||
using DsLayout = DsLayout_;
|
||||
|
||||
using InDataType = InDataType_;
|
||||
using WeiDataType = WeiDataType_;
|
||||
using OutDataType = OutDataType_;
|
||||
using AccDataType = AccDataType_;
|
||||
using DsDataType = DsDataType_;
|
||||
|
||||
using InElementwiseOperation = InElementwiseOperation_;
|
||||
using WeiElementwiseOperation = WeiElementwiseOperation_;
|
||||
using OutElementwiseOperation = OutElementwiseOperation_;
|
||||
|
||||
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
|
||||
|
||||
static constexpr ck::index_t kBlockSize = BlockSize;
|
||||
static constexpr ck::index_t kMPerBlock = MPerBlock;
|
||||
static constexpr ck::index_t kNPerBlock = NPerBlock;
|
||||
static constexpr ck::index_t kK0PerBlock = K0PerBlock;
|
||||
static constexpr ck::index_t kK1 = K1;
|
||||
static constexpr ck::index_t kMPerXDL = MPerXDL;
|
||||
static constexpr ck::index_t kNPerXDL = NPerXDL;
|
||||
static constexpr ck::index_t kMXdlPerWave = MXdlPerWave;
|
||||
static constexpr ck::index_t kNXdlPerWave = NXdlPerWave;
|
||||
static constexpr ck::index_t kCShuffleMXdlPerWavePerShuffle = CShuffleMXdlPerWavePerShuffle;
|
||||
static constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = CShuffleNXdlPerWavePerShuffle;
|
||||
static constexpr ck::index_t kCBlockTransferScalarPerVector_NWaveNPerXdl =
|
||||
CBlockTransferScalarPerVector_NWaveNPerXdl;
|
||||
|
||||
using ABlockTransferThreadClusterLengths_K0_M_K1 = ABlockTransferThreadClusterLengths_K0_M_K1_;
|
||||
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
|
||||
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
|
||||
ABlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kABlockTransferDstScalarPerVector_K1 =
|
||||
ABlockTransferDstScalarPerVector_K1;
|
||||
static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
|
||||
|
||||
using BBlockTransferThreadClusterLengths_K0_N_K1 = BBlockTransferThreadClusterLengths_K0_N_K1_;
|
||||
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
|
||||
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
|
||||
BBlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kBBlockTransferDstScalarPerVector_K1 =
|
||||
BBlockTransferDstScalarPerVector_K1;
|
||||
static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
|
||||
|
||||
using CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
|
||||
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
|
||||
|
||||
using ComputeTypeA = ComputeTypeA_;
|
||||
using ComputeTypeB = ComputeTypeB_;
|
||||
|
||||
// Static member function to generate instance string
|
||||
static std::string instance_string()
|
||||
{
|
||||
std::ostringstream oss;
|
||||
|
||||
// Kernel type name
|
||||
oss << "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle";
|
||||
|
||||
// Template parameters in exact order
|
||||
oss << "<" << kNDimSpatial; // 1. NDimSpatial
|
||||
oss << "," << detail::layout_name<InLayout>(); // 2. InLayout
|
||||
oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
|
||||
oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
|
||||
oss << "," << detail::tuple_name<DsLayout>(); // 5. DsLayout
|
||||
oss << "," << detail::type_name<InDataType>(); // 6. InDataType
|
||||
oss << "," << detail::type_name<WeiDataType>(); // 7. WeiDataType
|
||||
oss << "," << detail::type_name<OutDataType>(); // 8. OutDataType
|
||||
oss << "," << detail::type_name<AccDataType>(); // 9. AccDataType
|
||||
oss << "," << detail::tuple_name<DsDataType>(); // 10. DsDataType
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<InElementwiseOperation>(); // 11. InElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<WeiElementwiseOperation>(); // 12.
|
||||
// WeiElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<OutElementwiseOperation>(); // 13.
|
||||
// OutElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::conv_bwd_weight_spec_name(
|
||||
kConvBackwardWeightSpecialization); // 14. ConvBackwardWeightSpecialization
|
||||
oss << "," << kBlockSize; // 15. BlockSize
|
||||
oss << "," << kMPerBlock; // 16. MPerBlock
|
||||
oss << "," << kNPerBlock; // 17. NPerBlock
|
||||
oss << "," << kK0PerBlock; // 18. K0PerBlock
|
||||
oss << "," << kK1; // 19. K1
|
||||
oss << "," << kMPerXDL; // 20. MPerXDL
|
||||
oss << "," << kNPerXDL; // 21. NPerXDL
|
||||
oss << "," << kMXdlPerWave; // 22. MXdlPerWave
|
||||
oss << "," << kNXdlPerWave; // 23. NXdlPerWave
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_K0_M_K1>(); // 24.
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>(); // 25.
|
||||
oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>(); // 26.
|
||||
oss << "," << kABlockTransferSrcVectorDim; // 27.
|
||||
oss << "," << kABlockTransferSrcScalarPerVector; // 28.
|
||||
oss << "," << kABlockTransferDstScalarPerVector_K1; // 29.
|
||||
oss << "," << (kABlockLdsAddExtraM ? "true" : "false"); // 30.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_K0_N_K1>(); // 31.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>(); // 32.
|
||||
oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>(); // 33.
|
||||
oss << "," << kBBlockTransferSrcVectorDim; // 34.
|
||||
oss << "," << kBBlockTransferSrcScalarPerVector; // 35.
|
||||
oss << "," << kBBlockTransferDstScalarPerVector_K1; // 36.
|
||||
oss << "," << (kBBlockLdsAddExtraN ? "true" : "false"); // 37.
|
||||
oss << "," << kCShuffleMXdlPerWavePerShuffle; // 38.
|
||||
oss << "," << kCShuffleNXdlPerWavePerShuffle; // 39.
|
||||
oss << ","
|
||||
<< detail::sequence_name<
|
||||
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 40.
|
||||
oss << "," << kCBlockTransferScalarPerVector_NWaveNPerXdl; // 41.
|
||||
oss << "," << detail::type_name<ComputeTypeA>(); // 42.
|
||||
oss << "," << detail::type_name<ComputeTypeB>(); // 43.
|
||||
oss << ">";
|
||||
|
||||
return oss.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace reflect
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,302 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "instance_traits.hpp"
|
||||
#include "instance_traits_util.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
|
||||
|
||||
namespace ck::tensor_operation::device {
|
||||
|
||||
// Forward declaration of the device operation
// DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3. Only the name and the
// template parameter list are introduced here; no definition is provided by
// this declaration. The parameter list has to stay in sync with the defining
// header (C++ requires redeclarations of a template to agree).
template <
    // Spatial rank, tensor layouts, data types and element-wise operations.
    ck::index_t NDimSpatial,
    typename InLayout,
    typename WeiLayout,
    typename OutLayout,
    typename InDataType,
    typename WeiDataType,
    typename OutDataType,
    typename AccDataType,
    typename InElementwiseOperation,
    typename WeiElementwiseOperation,
    typename OutElementwiseOperation,
    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
        ConvBackwardWeightSpecialization,
    // Block-, WMMA- and repeat-level sizes.
    ck::index_t BlockSize,
    ck::index_t MPerBlock,
    ck::index_t NPerBlock,
    ck::index_t KPerBlock,
    ck::index_t ABK1,
    ck::index_t MPerWmma,
    ck::index_t NPerWmma,
    ck::index_t MRepeat,
    ck::index_t NRepeat,
    // ABlockTransfer* / ABlockLds* parameters.
    typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
    typename ABlockTransferThreadClusterArrangeOrder,
    typename ABlockTransferSrcAccessOrder,
    ck::index_t ABlockTransferSrcVectorDim,
    ck::index_t ABlockTransferSrcScalarPerVector,
    ck::index_t ABlockTransferDstScalarPerVector_AK1,
    bool ABlockLdsAddExtraM,
    // BBlockTransfer* / BBlockLds* parameters.
    typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
    typename BBlockTransferThreadClusterArrangeOrder,
    typename BBlockTransferSrcAccessOrder,
    ck::index_t BBlockTransferSrcVectorDim,
    ck::index_t BBlockTransferSrcScalarPerVector,
    ck::index_t BBlockTransferDstScalarPerVector_BK1,
    bool BBlockLdsAddExtraN,
    // CShuffle* parameters.
    ck::index_t CShuffleMRepeatPerShuffle,
    ck::index_t CShuffleNRepeatPerShuffle,
    typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
    ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
    // Pipeline selection, group merging, compute types and transpose-transfer widths.
    ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
    ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
    ck::index_t NumGroupsToMerge,
    typename ComputeTypeA,
    typename ComputeTypeB,
    ck::index_t TransposeTransferSrcScalarPerVector,
    ck::index_t TransposeTransferDstScalarPerVector>
struct DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3;
|
||||
|
||||
} // namespace ck::tensor_operation::device
|
||||
|
||||
namespace ck_tile {
|
||||
namespace reflect {
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout_,
|
||||
typename WeiLayout_,
|
||||
typename OutLayout_,
|
||||
typename InDataType_,
|
||||
typename WeiDataType_,
|
||||
typename OutDataType_,
|
||||
typename AccDataType_,
|
||||
typename InElementwiseOperation_,
|
||||
typename WeiElementwiseOperation_,
|
||||
typename OutElementwiseOperation_,
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
|
||||
ConvBackwardWeightSpecialization,
|
||||
ck::index_t BlockSize,
|
||||
ck::index_t MPerBlock,
|
||||
ck::index_t NPerBlock,
|
||||
ck::index_t KPerBlock,
|
||||
ck::index_t ABK1,
|
||||
ck::index_t MPerWmma,
|
||||
ck::index_t NPerWmma,
|
||||
ck::index_t MRepeat,
|
||||
ck::index_t NRepeat,
|
||||
typename ABlockTransferThreadClusterLengths_AK0_M_AK1_,
|
||||
typename ABlockTransferThreadClusterArrangeOrder_,
|
||||
typename ABlockTransferSrcAccessOrder_,
|
||||
ck::index_t ABlockTransferSrcVectorDim,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_AK1,
|
||||
bool ABlockLdsAddExtraM,
|
||||
typename BBlockTransferThreadClusterLengths_BK0_N_BK1_,
|
||||
typename BBlockTransferThreadClusterArrangeOrder_,
|
||||
typename BBlockTransferSrcAccessOrder_,
|
||||
ck::index_t BBlockTransferSrcVectorDim,
|
||||
ck::index_t BBlockTransferSrcScalarPerVector,
|
||||
ck::index_t BBlockTransferDstScalarPerVector_BK1,
|
||||
bool BBlockLdsAddExtraN,
|
||||
ck::index_t CShuffleMRepeatPerShuffle,
|
||||
ck::index_t CShuffleNRepeatPerShuffle,
|
||||
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
|
||||
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
|
||||
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
|
||||
ck::index_t NumGroupsToMerge,
|
||||
typename ComputeTypeA_,
|
||||
typename ComputeTypeB_,
|
||||
ck::index_t TransposeTransferSrcScalarPerVector,
|
||||
ck::index_t TransposeTransferDstScalarPerVector>
|
||||
struct InstanceTraits<
|
||||
ck::tensor_operation::device::DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3<
|
||||
NDimSpatial,
|
||||
InLayout_,
|
||||
WeiLayout_,
|
||||
OutLayout_,
|
||||
InDataType_,
|
||||
WeiDataType_,
|
||||
OutDataType_,
|
||||
AccDataType_,
|
||||
InElementwiseOperation_,
|
||||
WeiElementwiseOperation_,
|
||||
OutElementwiseOperation_,
|
||||
ConvBackwardWeightSpecialization,
|
||||
BlockSize,
|
||||
MPerBlock,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
ABK1,
|
||||
MPerWmma,
|
||||
NPerWmma,
|
||||
MRepeat,
|
||||
NRepeat,
|
||||
ABlockTransferThreadClusterLengths_AK0_M_AK1_,
|
||||
ABlockTransferThreadClusterArrangeOrder_,
|
||||
ABlockTransferSrcAccessOrder_,
|
||||
ABlockTransferSrcVectorDim,
|
||||
ABlockTransferSrcScalarPerVector,
|
||||
ABlockTransferDstScalarPerVector_AK1,
|
||||
ABlockLdsAddExtraM,
|
||||
BBlockTransferThreadClusterLengths_BK0_N_BK1_,
|
||||
BBlockTransferThreadClusterArrangeOrder_,
|
||||
BBlockTransferSrcAccessOrder_,
|
||||
BBlockTransferSrcVectorDim,
|
||||
BBlockTransferSrcScalarPerVector,
|
||||
BBlockTransferDstScalarPerVector_BK1,
|
||||
BBlockLdsAddExtraN,
|
||||
CShuffleMRepeatPerShuffle,
|
||||
CShuffleNRepeatPerShuffle,
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
CShuffleBlockTransferScalarPerVector_NPerBlock,
|
||||
BlkGemmPipeSched,
|
||||
BlkGemmPipelineVer,
|
||||
NumGroupsToMerge,
|
||||
ComputeTypeA_,
|
||||
ComputeTypeB_,
|
||||
TransposeTransferSrcScalarPerVector,
|
||||
TransposeTransferDstScalarPerVector>>
|
||||
{
|
||||
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3";
|
||||
|
||||
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
|
||||
|
||||
using InLayout = InLayout_;
|
||||
using WeiLayout = WeiLayout_;
|
||||
using OutLayout = OutLayout_;
|
||||
|
||||
using InDataType = InDataType_;
|
||||
using WeiDataType = WeiDataType_;
|
||||
using OutDataType = OutDataType_;
|
||||
using AccDataType = AccDataType_;
|
||||
|
||||
using InElementwiseOperation = InElementwiseOperation_;
|
||||
using WeiElementwiseOperation = WeiElementwiseOperation_;
|
||||
using OutElementwiseOperation = OutElementwiseOperation_;
|
||||
|
||||
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
|
||||
|
||||
static constexpr ck::index_t kBlockSize = BlockSize;
|
||||
static constexpr ck::index_t kMPerBlock = MPerBlock;
|
||||
static constexpr ck::index_t kNPerBlock = NPerBlock;
|
||||
static constexpr ck::index_t kKPerBlock = KPerBlock;
|
||||
static constexpr ck::index_t kABK1 = ABK1;
|
||||
static constexpr ck::index_t kMPerWmma = MPerWmma;
|
||||
static constexpr ck::index_t kNPerWmma = NPerWmma;
|
||||
static constexpr ck::index_t kMRepeat = MRepeat;
|
||||
static constexpr ck::index_t kNRepeat = NRepeat;
|
||||
static constexpr ck::index_t kCShuffleMRepeatPerShuffle = CShuffleMRepeatPerShuffle;
|
||||
static constexpr ck::index_t kCShuffleNRepeatPerShuffle = CShuffleNRepeatPerShuffle;
|
||||
static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_NPerBlock =
|
||||
CShuffleBlockTransferScalarPerVector_NPerBlock;
|
||||
static constexpr ck::index_t kNumGroupsToMerge = NumGroupsToMerge;
|
||||
static constexpr ck::index_t kTransposeTransferSrcScalarPerVector =
|
||||
TransposeTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kTransposeTransferDstScalarPerVector =
|
||||
TransposeTransferDstScalarPerVector;
|
||||
|
||||
using ABlockTransferThreadClusterLengths_AK0_M_AK1 =
|
||||
ABlockTransferThreadClusterLengths_AK0_M_AK1_;
|
||||
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
|
||||
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
|
||||
ABlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kABlockTransferDstScalarPerVector_AK1 =
|
||||
ABlockTransferDstScalarPerVector_AK1;
|
||||
static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
|
||||
|
||||
using BBlockTransferThreadClusterLengths_BK0_N_BK1 =
|
||||
BBlockTransferThreadClusterLengths_BK0_N_BK1_;
|
||||
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
|
||||
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
|
||||
BBlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kBBlockTransferDstScalarPerVector_BK1 =
|
||||
BBlockTransferDstScalarPerVector_BK1;
|
||||
static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
|
||||
|
||||
using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
|
||||
|
||||
static constexpr ck::BlockGemmPipelineScheduler kBlkGemmPipeSched = BlkGemmPipeSched;
|
||||
static constexpr ck::BlockGemmPipelineVersion kBlkGemmPipelineVer = BlkGemmPipelineVer;
|
||||
|
||||
using ComputeTypeA = ComputeTypeA_;
|
||||
using ComputeTypeB = ComputeTypeB_;
|
||||
|
||||
// Static member function to generate instance string
|
||||
static std::string instance_string()
|
||||
{
|
||||
std::ostringstream oss;
|
||||
|
||||
// Kernel type name
|
||||
oss << "DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3";
|
||||
|
||||
// Template parameters in exact order
|
||||
oss << "<" << kNDimSpatial; // 1. NDimSpatial
|
||||
oss << "," << detail::layout_name<InLayout>(); // 2. InLayout
|
||||
oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
|
||||
oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
|
||||
oss << "," << detail::type_name<InDataType>(); // 5. InDataType
|
||||
oss << "," << detail::type_name<WeiDataType>(); // 6. WeiDataType
|
||||
oss << "," << detail::type_name<OutDataType>(); // 7. OutDataType
|
||||
oss << "," << detail::type_name<AccDataType>(); // 8. AccDataType
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<InElementwiseOperation>(); // 9. InElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<WeiElementwiseOperation>(); // 10.
|
||||
// WeiElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<OutElementwiseOperation>(); // 11.
|
||||
// OutElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::conv_bwd_weight_spec_name(
|
||||
kConvBackwardWeightSpecialization); // 12. ConvBackwardWeightSpecialization
|
||||
oss << "," << kBlockSize; // 13. BlockSize
|
||||
oss << "," << kMPerBlock; // 14. MPerBlock
|
||||
oss << "," << kNPerBlock; // 15. NPerBlock
|
||||
oss << "," << kKPerBlock; // 16. KPerBlock
|
||||
oss << "," << kABK1; // 17. ABK1
|
||||
oss << "," << kMPerWmma; // 18. MPerWmma
|
||||
oss << "," << kNPerWmma; // 19. NPerWmma
|
||||
oss << "," << kMRepeat; // 20. MRepeat
|
||||
oss << "," << kNRepeat; // 21. NRepeat
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_AK0_M_AK1>(); // 22.
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>(); // 23.
|
||||
oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>(); // 24.
|
||||
oss << "," << kABlockTransferSrcVectorDim; // 25.
|
||||
oss << "," << kABlockTransferSrcScalarPerVector; // 26.
|
||||
oss << "," << kABlockTransferDstScalarPerVector_AK1; // 27.
|
||||
oss << "," << (kABlockLdsAddExtraM ? "true" : "false"); // 28.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_BK0_N_BK1>(); // 29.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>(); // 30.
|
||||
oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>(); // 31.
|
||||
oss << "," << kBBlockTransferSrcVectorDim; // 32.
|
||||
oss << "," << kBBlockTransferSrcScalarPerVector; // 33.
|
||||
oss << "," << kBBlockTransferDstScalarPerVector_BK1; // 34.
|
||||
oss << "," << (kBBlockLdsAddExtraN ? "true" : "false"); // 35.
|
||||
oss << "," << kCShuffleMRepeatPerShuffle; // 36.
|
||||
oss << "," << kCShuffleNRepeatPerShuffle; // 37.
|
||||
oss << ","
|
||||
<< detail::sequence_name<
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 38.
|
||||
oss << "," << kCShuffleBlockTransferScalarPerVector_NPerBlock; // 39.
|
||||
oss << "," << detail::pipeline_scheduler_name(kBlkGemmPipeSched); // 40.
|
||||
oss << "," << detail::pipeline_version_name(kBlkGemmPipelineVer); // 41.
|
||||
oss << "," << kNumGroupsToMerge; // 42.
|
||||
oss << "," << detail::type_name<ComputeTypeA>(); // 43.
|
||||
oss << "," << detail::type_name<ComputeTypeB>(); // 44.
|
||||
oss << "," << kTransposeTransferSrcScalarPerVector; // 45.
|
||||
oss << "," << kTransposeTransferDstScalarPerVector; // 46.
|
||||
oss << ">";
|
||||
|
||||
return oss.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace reflect
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,299 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "instance_traits.hpp"
|
||||
#include "instance_traits_util.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
|
||||
|
||||
namespace ck::tensor_operation::device {
|
||||
|
||||
// Forward declaration of the device operation
// DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle. Only the name and the
// template parameter list are introduced here; no definition is provided by
// this declaration. The parameter list has to stay in sync with the defining
// header (C++ requires redeclarations of a template to agree).
template <
    // Spatial rank, tensor layouts, data types and element-wise operations.
    ck::index_t NDimSpatial,
    typename InLayout,
    typename WeiLayout,
    typename OutLayout,
    typename InDataType,
    typename WeiDataType,
    typename OutDataType,
    typename AccDataType,
    typename InElementwiseOperation,
    typename WeiElementwiseOperation,
    typename OutElementwiseOperation,
    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
        ConvBackwardWeightSpecialization,
    // Block-, XDL- and per-wave sizes.
    ck::index_t BlockSize,
    ck::index_t MPerBlock,
    ck::index_t NPerBlock,
    ck::index_t KPerBlock,
    ck::index_t K1,
    ck::index_t MPerXDL,
    ck::index_t NPerXDL,
    ck::index_t MXdlPerWave,
    ck::index_t NXdlPerWave,
    // ABlockTransfer* / ABlockLds* parameters.
    typename ABlockTransferThreadClusterLengths_K0_M_K1,
    typename ABlockTransferThreadClusterArrangeOrder,
    typename ABlockTransferSrcAccessOrder,
    ck::index_t ABlockTransferSrcVectorDim,
    ck::index_t ABlockTransferSrcScalarPerVector,
    ck::index_t ABlockTransferDstScalarPerVector_K1,
    bool ABlockLdsAddExtraM,
    // BBlockTransfer* / BBlockLds* parameters.
    typename BBlockTransferThreadClusterLengths_K0_N_K1,
    typename BBlockTransferThreadClusterArrangeOrder,
    typename BBlockTransferSrcAccessOrder,
    ck::index_t BBlockTransferSrcVectorDim,
    ck::index_t BBlockTransferSrcScalarPerVector,
    ck::index_t BBlockTransferDstScalarPerVector_K1,
    bool BBlockLdsAddExtraN,
    // CShuffle* / CBlockTransfer* parameters.
    ck::index_t CShuffleMXdlPerWavePerShuffle,
    ck::index_t CShuffleNXdlPerWavePerShuffle,
    typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
    ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
    // Pipeline selection, group merging, compute types and transpose-transfer widths.
    ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
    ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
    ck::index_t NumGroupsToMerge,
    typename ComputeTypeA,
    typename ComputeTypeB,
    ck::index_t TransposeTransferSrcScalarPerVector,
    ck::index_t TransposeTransferDstScalarPerVector>
struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle;
|
||||
|
||||
} // namespace ck::tensor_operation::device
|
||||
|
||||
namespace ck_tile {
|
||||
namespace reflect {
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout_,
|
||||
typename WeiLayout_,
|
||||
typename OutLayout_,
|
||||
typename InDataType_,
|
||||
typename WeiDataType_,
|
||||
typename OutDataType_,
|
||||
typename AccDataType_,
|
||||
typename InElementwiseOperation_,
|
||||
typename WeiElementwiseOperation_,
|
||||
typename OutElementwiseOperation_,
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
|
||||
ConvBackwardWeightSpecialization,
|
||||
ck::index_t BlockSize,
|
||||
ck::index_t MPerBlock,
|
||||
ck::index_t NPerBlock,
|
||||
ck::index_t KPerBlock,
|
||||
ck::index_t K1,
|
||||
ck::index_t MPerXDL,
|
||||
ck::index_t NPerXDL,
|
||||
ck::index_t MXdlPerWave,
|
||||
ck::index_t NXdlPerWave,
|
||||
typename ABlockTransferThreadClusterLengths_K0_M_K1_,
|
||||
typename ABlockTransferThreadClusterArrangeOrder_,
|
||||
typename ABlockTransferSrcAccessOrder_,
|
||||
ck::index_t ABlockTransferSrcVectorDim,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_K1,
|
||||
bool ABlockLdsAddExtraM,
|
||||
typename BBlockTransferThreadClusterLengths_K0_N_K1_,
|
||||
typename BBlockTransferThreadClusterArrangeOrder_,
|
||||
typename BBlockTransferSrcAccessOrder_,
|
||||
ck::index_t BBlockTransferSrcVectorDim,
|
||||
ck::index_t BBlockTransferSrcScalarPerVector,
|
||||
ck::index_t BBlockTransferDstScalarPerVector_K1,
|
||||
bool BBlockLdsAddExtraN,
|
||||
ck::index_t CShuffleMXdlPerWavePerShuffle,
|
||||
ck::index_t CShuffleNXdlPerWavePerShuffle,
|
||||
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
|
||||
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
|
||||
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
|
||||
ck::index_t NumGroupsToMerge,
|
||||
typename ComputeTypeA_,
|
||||
typename ComputeTypeB_,
|
||||
ck::index_t TransposeTransferSrcScalarPerVector,
|
||||
ck::index_t TransposeTransferDstScalarPerVector>
|
||||
struct InstanceTraits<ck::tensor_operation::device::DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout_,
|
||||
WeiLayout_,
|
||||
OutLayout_,
|
||||
InDataType_,
|
||||
WeiDataType_,
|
||||
OutDataType_,
|
||||
AccDataType_,
|
||||
InElementwiseOperation_,
|
||||
WeiElementwiseOperation_,
|
||||
OutElementwiseOperation_,
|
||||
ConvBackwardWeightSpecialization,
|
||||
BlockSize,
|
||||
MPerBlock,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
K1,
|
||||
MPerXDL,
|
||||
NPerXDL,
|
||||
MXdlPerWave,
|
||||
NXdlPerWave,
|
||||
ABlockTransferThreadClusterLengths_K0_M_K1_,
|
||||
ABlockTransferThreadClusterArrangeOrder_,
|
||||
ABlockTransferSrcAccessOrder_,
|
||||
ABlockTransferSrcVectorDim,
|
||||
ABlockTransferSrcScalarPerVector,
|
||||
ABlockTransferDstScalarPerVector_K1,
|
||||
ABlockLdsAddExtraM,
|
||||
BBlockTransferThreadClusterLengths_K0_N_K1_,
|
||||
BBlockTransferThreadClusterArrangeOrder_,
|
||||
BBlockTransferSrcAccessOrder_,
|
||||
BBlockTransferSrcVectorDim,
|
||||
BBlockTransferSrcScalarPerVector,
|
||||
BBlockTransferDstScalarPerVector_K1,
|
||||
BBlockLdsAddExtraN,
|
||||
CShuffleMXdlPerWavePerShuffle,
|
||||
CShuffleNXdlPerWavePerShuffle,
|
||||
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
CBlockTransferScalarPerVector_NWaveNPerXdl,
|
||||
BlkGemmPipeSched,
|
||||
BlkGemmPipelineVer,
|
||||
NumGroupsToMerge,
|
||||
ComputeTypeA_,
|
||||
ComputeTypeB_,
|
||||
TransposeTransferSrcScalarPerVector,
|
||||
TransposeTransferDstScalarPerVector>>
|
||||
{
|
||||
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle";
|
||||
|
||||
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
|
||||
|
||||
using InLayout = InLayout_;
|
||||
using WeiLayout = WeiLayout_;
|
||||
using OutLayout = OutLayout_;
|
||||
|
||||
using InDataType = InDataType_;
|
||||
using WeiDataType = WeiDataType_;
|
||||
using OutDataType = OutDataType_;
|
||||
using AccDataType = AccDataType_;
|
||||
|
||||
using InElementwiseOperation = InElementwiseOperation_;
|
||||
using WeiElementwiseOperation = WeiElementwiseOperation_;
|
||||
using OutElementwiseOperation = OutElementwiseOperation_;
|
||||
|
||||
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
|
||||
|
||||
static constexpr ck::index_t kBlockSize = BlockSize;
|
||||
static constexpr ck::index_t kMPerBlock = MPerBlock;
|
||||
static constexpr ck::index_t kNPerBlock = NPerBlock;
|
||||
static constexpr ck::index_t kKPerBlock = KPerBlock;
|
||||
static constexpr ck::index_t kK1 = K1;
|
||||
static constexpr ck::index_t kMPerXDL = MPerXDL;
|
||||
static constexpr ck::index_t kNPerXDL = NPerXDL;
|
||||
static constexpr ck::index_t kMXdlPerWave = MXdlPerWave;
|
||||
static constexpr ck::index_t kNXdlPerWave = NXdlPerWave;
|
||||
static constexpr ck::index_t kCShuffleMXdlPerWavePerShuffle = CShuffleMXdlPerWavePerShuffle;
|
||||
static constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = CShuffleNXdlPerWavePerShuffle;
|
||||
static constexpr ck::index_t kCBlockTransferScalarPerVector_NWaveNPerXdl =
|
||||
CBlockTransferScalarPerVector_NWaveNPerXdl;
|
||||
static constexpr ck::index_t kNumGroupsToMerge = NumGroupsToMerge;
|
||||
static constexpr ck::index_t kTransposeTransferSrcScalarPerVector =
|
||||
TransposeTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kTransposeTransferDstScalarPerVector =
|
||||
TransposeTransferDstScalarPerVector;
|
||||
|
||||
using ABlockTransferThreadClusterLengths_K0_M_K1 = ABlockTransferThreadClusterLengths_K0_M_K1_;
|
||||
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
|
||||
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
|
||||
ABlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kABlockTransferDstScalarPerVector_K1 =
|
||||
ABlockTransferDstScalarPerVector_K1;
|
||||
static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
|
||||
|
||||
using BBlockTransferThreadClusterLengths_K0_N_K1 = BBlockTransferThreadClusterLengths_K0_N_K1_;
|
||||
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
|
||||
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
|
||||
BBlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kBBlockTransferDstScalarPerVector_K1 =
|
||||
BBlockTransferDstScalarPerVector_K1;
|
||||
static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
|
||||
|
||||
using CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
|
||||
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
|
||||
|
||||
static constexpr ck::BlockGemmPipelineScheduler kBlkGemmPipeSched = BlkGemmPipeSched;
|
||||
static constexpr ck::BlockGemmPipelineVersion kBlkGemmPipelineVer = BlkGemmPipelineVer;
|
||||
|
||||
using ComputeTypeA = ComputeTypeA_;
|
||||
using ComputeTypeB = ComputeTypeB_;
|
||||
|
||||
// Static member function to generate instance string
|
||||
static std::string instance_string()
|
||||
{
|
||||
std::ostringstream oss;
|
||||
|
||||
// Kernel type name
|
||||
oss << "DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle";
|
||||
|
||||
// Template parameters in exact order
|
||||
oss << "<" << kNDimSpatial; // 1. NDimSpatial
|
||||
oss << "," << detail::layout_name<InLayout>(); // 2. InLayout
|
||||
oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
|
||||
oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
|
||||
oss << "," << detail::type_name<InDataType>(); // 5. InDataType
|
||||
oss << "," << detail::type_name<WeiDataType>(); // 6. WeiDataType
|
||||
oss << "," << detail::type_name<OutDataType>(); // 7. OutDataType
|
||||
oss << "," << detail::type_name<AccDataType>(); // 8. AccDataType
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<InElementwiseOperation>(); // 9. InElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<WeiElementwiseOperation>(); // 10.
|
||||
// WeiElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<OutElementwiseOperation>(); // 11.
|
||||
// OutElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::conv_bwd_weight_spec_name(
|
||||
kConvBackwardWeightSpecialization); // 12. ConvBackwardWeightSpecialization
|
||||
oss << "," << kBlockSize; // 13. BlockSize
|
||||
oss << "," << kMPerBlock; // 14. MPerBlock
|
||||
oss << "," << kNPerBlock; // 15. NPerBlock
|
||||
oss << "," << kKPerBlock; // 16. KPerBlock
|
||||
oss << "," << kK1; // 17. K1
|
||||
oss << "," << kMPerXDL; // 18. MPerXDL
|
||||
oss << "," << kNPerXDL; // 19. NPerXDL
|
||||
oss << "," << kMXdlPerWave; // 20. MXdlPerWave
|
||||
oss << "," << kNXdlPerWave; // 21. NXdlPerWave
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_K0_M_K1>(); // 22.
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>(); // 23.
|
||||
oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>(); // 24.
|
||||
oss << "," << kABlockTransferSrcVectorDim; // 25.
|
||||
oss << "," << kABlockTransferSrcScalarPerVector; // 26.
|
||||
oss << "," << kABlockTransferDstScalarPerVector_K1; // 27.
|
||||
oss << "," << (kABlockLdsAddExtraM ? "true" : "false"); // 28.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_K0_N_K1>(); // 29.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>(); // 30.
|
||||
oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>(); // 31.
|
||||
oss << "," << kBBlockTransferSrcVectorDim; // 32.
|
||||
oss << "," << kBBlockTransferSrcScalarPerVector; // 33.
|
||||
oss << "," << kBBlockTransferDstScalarPerVector_K1; // 34.
|
||||
oss << "," << (kBBlockLdsAddExtraN ? "true" : "false"); // 35.
|
||||
oss << "," << kCShuffleMXdlPerWavePerShuffle; // 36.
|
||||
oss << "," << kCShuffleNXdlPerWavePerShuffle; // 37.
|
||||
oss << ","
|
||||
<< detail::sequence_name<
|
||||
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 38.
|
||||
oss << "," << kCBlockTransferScalarPerVector_NWaveNPerXdl; // 39.
|
||||
oss << "," << detail::pipeline_scheduler_name(kBlkGemmPipeSched); // 40.
|
||||
oss << "," << detail::pipeline_version_name(kBlkGemmPipelineVer); // 41.
|
||||
oss << "," << kNumGroupsToMerge; // 42.
|
||||
oss << "," << detail::type_name<ComputeTypeA>(); // 43.
|
||||
oss << "," << detail::type_name<ComputeTypeB>(); // 44.
|
||||
oss << "," << kTransposeTransferSrcScalarPerVector; // 45.
|
||||
oss << "," << kTransposeTransferDstScalarPerVector; // 46.
|
||||
oss << ">";
|
||||
|
||||
return oss.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace reflect
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,277 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "instance_traits.hpp"
|
||||
#include "instance_traits_util.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
|
||||
|
||||
namespace ck::tensor_operation::device {
|
||||
|
||||
// Declaration only: introduces the template signature of the device operation so
// that InstanceTraits can be specialized on it further down in this header.
// No default template arguments are given in this declaration.
template <ck::index_t NDimSpatial,
          // Tensor layouts
          typename InLayout,
          typename WeiLayout,
          typename OutLayout,
          // Data types
          typename InDataType,
          typename WeiDataType,
          typename OutDataType,
          typename AccDataType,
          // Elementwise operations
          typename InElementwiseOperation,
          typename WeiElementwiseOperation,
          typename OutElementwiseOperation,
          ConvolutionBackwardWeightSpecialization ConvBackwardWeightSpecialization,
          // Block / tile sizes
          ck::index_t BlockSize,
          ck::index_t MPerBlock,
          ck::index_t NPerBlock,
          ck::index_t K0PerBlock,
          ck::index_t K1,
          ck::index_t MPerWMMA,
          ck::index_t NPerWMMA,
          ck::index_t MRepeat,
          ck::index_t NRepeat,
          // A block transfer
          typename ABlockTransferThreadClusterLengths_K0_M_K1,
          typename ABlockTransferThreadClusterArrangeOrder,
          typename ABlockTransferSrcAccessOrder,
          ck::index_t ABlockTransferSrcVectorDim,
          ck::index_t ABlockTransferSrcScalarPerVector,
          ck::index_t ABlockTransferDstScalarPerVector_K1,
          bool ABlockLdsAddExtraM,
          // B block transfer
          typename BBlockTransferThreadClusterLengths_K0_N_K1,
          typename BBlockTransferThreadClusterArrangeOrder,
          typename BBlockTransferSrcAccessOrder,
          ck::index_t BBlockTransferSrcVectorDim,
          ck::index_t BBlockTransferSrcScalarPerVector,
          ck::index_t BBlockTransferDstScalarPerVector_K1,
          bool BBlockLdsAddExtraN,
          // C shuffle
          ck::index_t CShuffleMRepeatPerShuffle,
          ck::index_t CShuffleNRepeatPerShuffle,
          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
          // Pipeline configuration
          ck::index_t NumGemmKPrefetchStage,
          ck::LoopScheduler LoopSched,
          ck::PipelineVersion PipelineVer,
          // Unnamed trailing non-type parameter.
          // NOTE(review): presumably a std::enable_if-style guard limiting the op
          // to NDimSpatial == 3 - confirm against the op's definition.
          typename ck::enable_if<NDimSpatial == 3, bool>::type>
struct DeviceGroupedConvBwdWeight_Wmma_CShuffle;
|
||||
|
||||
} // namespace ck::tensor_operation::device
|
||||
|
||||
namespace ck_tile {
|
||||
namespace reflect {
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout_,
|
||||
typename WeiLayout_,
|
||||
typename OutLayout_,
|
||||
typename InDataType_,
|
||||
typename WeiDataType_,
|
||||
typename OutDataType_,
|
||||
typename AccDataType_,
|
||||
typename InElementwiseOperation_,
|
||||
typename WeiElementwiseOperation_,
|
||||
typename OutElementwiseOperation_,
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
|
||||
ConvBackwardWeightSpecialization,
|
||||
ck::index_t BlockSize,
|
||||
ck::index_t MPerBlock,
|
||||
ck::index_t NPerBlock,
|
||||
ck::index_t K0PerBlock,
|
||||
ck::index_t K1,
|
||||
ck::index_t MPerWMMA,
|
||||
ck::index_t NPerWMMA,
|
||||
ck::index_t MRepeat,
|
||||
ck::index_t NRepeat,
|
||||
typename ABlockTransferThreadClusterLengths_K0_M_K1_,
|
||||
typename ABlockTransferThreadClusterArrangeOrder_,
|
||||
typename ABlockTransferSrcAccessOrder_,
|
||||
ck::index_t ABlockTransferSrcVectorDim,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_K1,
|
||||
bool ABlockLdsAddExtraM,
|
||||
typename BBlockTransferThreadClusterLengths_K0_N_K1_,
|
||||
typename BBlockTransferThreadClusterArrangeOrder_,
|
||||
typename BBlockTransferSrcAccessOrder_,
|
||||
ck::index_t BBlockTransferSrcVectorDim,
|
||||
ck::index_t BBlockTransferSrcScalarPerVector,
|
||||
ck::index_t BBlockTransferDstScalarPerVector_K1,
|
||||
bool BBlockLdsAddExtraN,
|
||||
ck::index_t CShuffleMRepeatPerShuffle,
|
||||
ck::index_t CShuffleNRepeatPerShuffle,
|
||||
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
|
||||
ck::index_t NumGemmKPrefetchStage,
|
||||
ck::LoopScheduler LoopSched,
|
||||
ck::PipelineVersion PipelineVer>
|
||||
struct InstanceTraits<ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Wmma_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout_,
|
||||
WeiLayout_,
|
||||
OutLayout_,
|
||||
InDataType_,
|
||||
WeiDataType_,
|
||||
OutDataType_,
|
||||
AccDataType_,
|
||||
InElementwiseOperation_,
|
||||
WeiElementwiseOperation_,
|
||||
OutElementwiseOperation_,
|
||||
ConvBackwardWeightSpecialization,
|
||||
BlockSize,
|
||||
MPerBlock,
|
||||
NPerBlock,
|
||||
K0PerBlock,
|
||||
K1,
|
||||
MPerWMMA,
|
||||
NPerWMMA,
|
||||
MRepeat,
|
||||
NRepeat,
|
||||
ABlockTransferThreadClusterLengths_K0_M_K1_,
|
||||
ABlockTransferThreadClusterArrangeOrder_,
|
||||
ABlockTransferSrcAccessOrder_,
|
||||
ABlockTransferSrcVectorDim,
|
||||
ABlockTransferSrcScalarPerVector,
|
||||
ABlockTransferDstScalarPerVector_K1,
|
||||
ABlockLdsAddExtraM,
|
||||
BBlockTransferThreadClusterLengths_K0_N_K1_,
|
||||
BBlockTransferThreadClusterArrangeOrder_,
|
||||
BBlockTransferSrcAccessOrder_,
|
||||
BBlockTransferSrcVectorDim,
|
||||
BBlockTransferSrcScalarPerVector,
|
||||
BBlockTransferDstScalarPerVector_K1,
|
||||
BBlockLdsAddExtraN,
|
||||
CShuffleMRepeatPerShuffle,
|
||||
CShuffleNRepeatPerShuffle,
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
CShuffleBlockTransferScalarPerVector_NPerBlock,
|
||||
NumGemmKPrefetchStage,
|
||||
LoopSched,
|
||||
PipelineVer,
|
||||
false>> // Use false to match with the default value
|
||||
{
|
||||
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeight_Wmma_CShuffle";
|
||||
|
||||
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
|
||||
|
||||
using InLayout = InLayout_;
|
||||
using WeiLayout = WeiLayout_;
|
||||
using OutLayout = OutLayout_;
|
||||
|
||||
using InDataType = InDataType_;
|
||||
using WeiDataType = WeiDataType_;
|
||||
using OutDataType = OutDataType_;
|
||||
using AccDataType = AccDataType_;
|
||||
|
||||
using InElementwiseOperation = InElementwiseOperation_;
|
||||
using WeiElementwiseOperation = WeiElementwiseOperation_;
|
||||
using OutElementwiseOperation = OutElementwiseOperation_;
|
||||
|
||||
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
|
||||
|
||||
static constexpr ck::index_t kBlockSize = BlockSize;
|
||||
static constexpr ck::index_t kMPerBlock = MPerBlock;
|
||||
static constexpr ck::index_t kNPerBlock = NPerBlock;
|
||||
static constexpr ck::index_t kK0PerBlock = K0PerBlock;
|
||||
static constexpr ck::index_t kK1 = K1;
|
||||
static constexpr ck::index_t kMPerWMMA = MPerWMMA;
|
||||
static constexpr ck::index_t kNPerWMMA = NPerWMMA;
|
||||
static constexpr ck::index_t kMRepeat = MRepeat;
|
||||
static constexpr ck::index_t kNRepeat = NRepeat;
|
||||
static constexpr ck::index_t kCShuffleMRepeatPerShuffle = CShuffleMRepeatPerShuffle;
|
||||
static constexpr ck::index_t kCShuffleNRepeatPerShuffle = CShuffleNRepeatPerShuffle;
|
||||
static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_NPerBlock =
|
||||
CShuffleBlockTransferScalarPerVector_NPerBlock;
|
||||
static constexpr ck::index_t kNumGemmKPrefetchStage = NumGemmKPrefetchStage;
|
||||
|
||||
using ABlockTransferThreadClusterLengths_K0_M_K1 = ABlockTransferThreadClusterLengths_K0_M_K1_;
|
||||
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
|
||||
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
|
||||
ABlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kABlockTransferDstScalarPerVector_K1 =
|
||||
ABlockTransferDstScalarPerVector_K1;
|
||||
static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
|
||||
|
||||
using BBlockTransferThreadClusterLengths_K0_N_K1 = BBlockTransferThreadClusterLengths_K0_N_K1_;
|
||||
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
|
||||
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
|
||||
BBlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kBBlockTransferDstScalarPerVector_K1 =
|
||||
BBlockTransferDstScalarPerVector_K1;
|
||||
static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
|
||||
|
||||
using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
|
||||
|
||||
static constexpr ck::LoopScheduler kLoopSched = LoopSched;
|
||||
static constexpr ck::PipelineVersion kPipelineVer = PipelineVer;
|
||||
|
||||
// Static member function to generate instance string
|
||||
static std::string instance_string()
|
||||
{
|
||||
std::ostringstream oss;
|
||||
|
||||
// Kernel type name
|
||||
oss << "DeviceGroupedConvBwdWeight_Wmma_CShuffle";
|
||||
|
||||
// Template parameters in exact order
|
||||
oss << "<" << kNDimSpatial; // 1. NDimSpatial
|
||||
oss << "," << detail::layout_name<InLayout>(); // 2. InLayout
|
||||
oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
|
||||
oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
|
||||
oss << "," << detail::type_name<InDataType>(); // 5. InDataType
|
||||
oss << "," << detail::type_name<WeiDataType>(); // 6. WeiDataType
|
||||
oss << "," << detail::type_name<OutDataType>(); // 7. OutDataType
|
||||
oss << "," << detail::type_name<AccDataType>(); // 8. AccDataType
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<InElementwiseOperation>(); // 9. InElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<WeiElementwiseOperation>(); // 10.
|
||||
// WeiElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<OutElementwiseOperation>(); // 11.
|
||||
// OutElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::conv_bwd_weight_spec_name(
|
||||
kConvBackwardWeightSpecialization); // 12. ConvBackwardWeightSpecialization
|
||||
oss << "," << kBlockSize; // 13. BlockSize
|
||||
oss << "," << kMPerBlock; // 14. MPerBlock
|
||||
oss << "," << kNPerBlock; // 15. NPerBlock
|
||||
oss << "," << kK0PerBlock; // 16. K0PerBlock
|
||||
oss << "," << kK1; // 17. K1
|
||||
oss << "," << kMPerWMMA; // 18. MPerWMMA
|
||||
oss << "," << kNPerWMMA; // 19. NPerWMMA
|
||||
oss << "," << kMRepeat; // 20. MRepeat
|
||||
oss << "," << kNRepeat; // 21. NRepeat
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_K0_M_K1>(); // 22.
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>(); // 23.
|
||||
oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>(); // 24.
|
||||
oss << "," << kABlockTransferSrcVectorDim; // 25.
|
||||
oss << "," << kABlockTransferSrcScalarPerVector; // 26.
|
||||
oss << "," << kABlockTransferDstScalarPerVector_K1; // 27.
|
||||
oss << "," << (kABlockLdsAddExtraM ? "true" : "false"); // 28.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_K0_N_K1>(); // 29.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>(); // 30.
|
||||
oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>(); // 31.
|
||||
oss << "," << kBBlockTransferSrcVectorDim; // 32.
|
||||
oss << "," << kBBlockTransferSrcScalarPerVector; // 33.
|
||||
oss << "," << kBBlockTransferDstScalarPerVector_K1; // 34.
|
||||
oss << "," << (kBBlockLdsAddExtraN ? "true" : "false"); // 35.
|
||||
oss << "," << kCShuffleMRepeatPerShuffle; // 36.
|
||||
oss << "," << kCShuffleNRepeatPerShuffle; // 37.
|
||||
oss << ","
|
||||
<< detail::sequence_name<
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 38.
|
||||
oss << "," << kCShuffleBlockTransferScalarPerVector_NPerBlock; // 39.
|
||||
oss << "," << kNumGemmKPrefetchStage; // 40.
|
||||
oss << "," << detail::loop_scheduler_name(kLoopSched); // 41.
|
||||
oss << "," << detail::pipeline_version_name(kPipelineVer); // 42.
|
||||
oss << ">";
|
||||
|
||||
return oss.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace reflect
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,296 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "instance_traits.hpp"
|
||||
#include "instance_traits_util.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
|
||||
|
||||
namespace ck::tensor_operation::device {
|
||||
|
||||
// Declaration only: introduces the template signature of the device operation so
// that InstanceTraits can be specialized on it further down in this header.
// No default template arguments are given in this declaration.
template <ck::index_t NDimSpatial,
          // Tensor layouts
          typename InLayout,
          typename WeiLayout,
          typename OutLayout,
          // Data types
          typename InDataType,
          typename WeiDataType,
          typename OutDataType,
          typename AccDataType,
          // Elementwise operations
          typename InElementwiseOperation,
          typename WeiElementwiseOperation,
          typename OutElementwiseOperation,
          ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
              ConvBackwardWeightSpecialization,
          // Block / tile sizes
          ck::index_t BlockSize,
          ck::index_t MPerBlock,
          ck::index_t NPerBlock,
          ck::index_t KPerBlock,
          ck::index_t ABK1,
          ck::index_t MPerWmma,
          ck::index_t NPerWmma,
          ck::index_t MRepeat,
          ck::index_t NRepeat,
          // A block transfer
          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
          typename ABlockTransferThreadClusterArrangeOrder,
          typename ABlockTransferSrcAccessOrder,
          ck::index_t ABlockTransferSrcVectorDim,
          ck::index_t ABlockTransferSrcScalarPerVector,
          ck::index_t ABlockTransferDstScalarPerVector_AK1,
          bool ABlockLdsAddExtraM,
          // B block transfer
          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
          typename BBlockTransferThreadClusterArrangeOrder,
          typename BBlockTransferSrcAccessOrder,
          ck::index_t BBlockTransferSrcVectorDim,
          ck::index_t BBlockTransferSrcScalarPerVector,
          ck::index_t BBlockTransferDstScalarPerVector_BK1,
          bool BBlockLdsAddExtraN,
          // C shuffle
          ck::index_t CShuffleMRepeatPerShuffle,
          ck::index_t CShuffleNRepeatPerShuffle,
          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
          // Pipeline configuration
          ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
          ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
          // Compute types
          typename ComputeTypeA,
          typename ComputeTypeB,
          // Transpose transfer vector widths
          ck::index_t MaxTransposeTransferSrcScalarPerVector,
          ck::index_t MaxTransposeTransferDstScalarPerVector>
struct DeviceGroupedConvBwdWeight_Wmma_CShuffleV3;
|
||||
|
||||
} // namespace ck::tensor_operation::device
|
||||
|
||||
namespace ck_tile {
|
||||
namespace reflect {
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout_,
|
||||
typename WeiLayout_,
|
||||
typename OutLayout_,
|
||||
typename InDataType_,
|
||||
typename WeiDataType_,
|
||||
typename OutDataType_,
|
||||
typename AccDataType_,
|
||||
typename InElementwiseOperation_,
|
||||
typename WeiElementwiseOperation_,
|
||||
typename OutElementwiseOperation_,
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
|
||||
ConvBackwardWeightSpecialization,
|
||||
ck::index_t BlockSize,
|
||||
ck::index_t MPerBlock,
|
||||
ck::index_t NPerBlock,
|
||||
ck::index_t KPerBlock,
|
||||
ck::index_t ABK1,
|
||||
ck::index_t MPerWmma,
|
||||
ck::index_t NPerWmma,
|
||||
ck::index_t MRepeat,
|
||||
ck::index_t NRepeat,
|
||||
typename ABlockTransferThreadClusterLengths_AK0_M_AK1_,
|
||||
typename ABlockTransferThreadClusterArrangeOrder_,
|
||||
typename ABlockTransferSrcAccessOrder_,
|
||||
ck::index_t ABlockTransferSrcVectorDim,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_AK1,
|
||||
bool ABlockLdsAddExtraM,
|
||||
typename BBlockTransferThreadClusterLengths_BK0_N_BK1_,
|
||||
typename BBlockTransferThreadClusterArrangeOrder_,
|
||||
typename BBlockTransferSrcAccessOrder_,
|
||||
ck::index_t BBlockTransferSrcVectorDim,
|
||||
ck::index_t BBlockTransferSrcScalarPerVector,
|
||||
ck::index_t BBlockTransferDstScalarPerVector_BK1,
|
||||
bool BBlockLdsAddExtraN,
|
||||
ck::index_t CShuffleMRepeatPerShuffle,
|
||||
ck::index_t CShuffleNRepeatPerShuffle,
|
||||
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
|
||||
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
|
||||
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
|
||||
typename ComputeTypeA_,
|
||||
typename ComputeTypeB_,
|
||||
ck::index_t MaxTransposeTransferSrcScalarPerVector,
|
||||
ck::index_t MaxTransposeTransferDstScalarPerVector>
|
||||
struct InstanceTraits<ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Wmma_CShuffleV3<
|
||||
NDimSpatial,
|
||||
InLayout_,
|
||||
WeiLayout_,
|
||||
OutLayout_,
|
||||
InDataType_,
|
||||
WeiDataType_,
|
||||
OutDataType_,
|
||||
AccDataType_,
|
||||
InElementwiseOperation_,
|
||||
WeiElementwiseOperation_,
|
||||
OutElementwiseOperation_,
|
||||
ConvBackwardWeightSpecialization,
|
||||
BlockSize,
|
||||
MPerBlock,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
ABK1,
|
||||
MPerWmma,
|
||||
NPerWmma,
|
||||
MRepeat,
|
||||
NRepeat,
|
||||
ABlockTransferThreadClusterLengths_AK0_M_AK1_,
|
||||
ABlockTransferThreadClusterArrangeOrder_,
|
||||
ABlockTransferSrcAccessOrder_,
|
||||
ABlockTransferSrcVectorDim,
|
||||
ABlockTransferSrcScalarPerVector,
|
||||
ABlockTransferDstScalarPerVector_AK1,
|
||||
ABlockLdsAddExtraM,
|
||||
BBlockTransferThreadClusterLengths_BK0_N_BK1_,
|
||||
BBlockTransferThreadClusterArrangeOrder_,
|
||||
BBlockTransferSrcAccessOrder_,
|
||||
BBlockTransferSrcVectorDim,
|
||||
BBlockTransferSrcScalarPerVector,
|
||||
BBlockTransferDstScalarPerVector_BK1,
|
||||
BBlockLdsAddExtraN,
|
||||
CShuffleMRepeatPerShuffle,
|
||||
CShuffleNRepeatPerShuffle,
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
CShuffleBlockTransferScalarPerVector_NPerBlock,
|
||||
BlkGemmPipeSched,
|
||||
BlkGemmPipelineVer,
|
||||
ComputeTypeA_,
|
||||
ComputeTypeB_,
|
||||
MaxTransposeTransferSrcScalarPerVector,
|
||||
MaxTransposeTransferDstScalarPerVector>>
|
||||
{
|
||||
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3";
|
||||
|
||||
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
|
||||
|
||||
using InLayout = InLayout_;
|
||||
using WeiLayout = WeiLayout_;
|
||||
using OutLayout = OutLayout_;
|
||||
|
||||
using InDataType = InDataType_;
|
||||
using WeiDataType = WeiDataType_;
|
||||
using OutDataType = OutDataType_;
|
||||
using AccDataType = AccDataType_;
|
||||
|
||||
using InElementwiseOperation = InElementwiseOperation_;
|
||||
using WeiElementwiseOperation = WeiElementwiseOperation_;
|
||||
using OutElementwiseOperation = OutElementwiseOperation_;
|
||||
|
||||
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
|
||||
|
||||
static constexpr ck::index_t kBlockSize = BlockSize;
|
||||
static constexpr ck::index_t kMPerBlock = MPerBlock;
|
||||
static constexpr ck::index_t kNPerBlock = NPerBlock;
|
||||
static constexpr ck::index_t kKPerBlock = KPerBlock;
|
||||
static constexpr ck::index_t kABK1 = ABK1;
|
||||
static constexpr ck::index_t kMPerWmma = MPerWmma;
|
||||
static constexpr ck::index_t kNPerWmma = NPerWmma;
|
||||
static constexpr ck::index_t kMRepeat = MRepeat;
|
||||
static constexpr ck::index_t kNRepeat = NRepeat;
|
||||
static constexpr ck::index_t kCShuffleMRepeatPerShuffle = CShuffleMRepeatPerShuffle;
|
||||
static constexpr ck::index_t kCShuffleNRepeatPerShuffle = CShuffleNRepeatPerShuffle;
|
||||
static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_NPerBlock =
|
||||
CShuffleBlockTransferScalarPerVector_NPerBlock;
|
||||
static constexpr ck::index_t kMaxTransposeTransferSrcScalarPerVector =
|
||||
MaxTransposeTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kMaxTransposeTransferDstScalarPerVector =
|
||||
MaxTransposeTransferDstScalarPerVector;
|
||||
|
||||
using ABlockTransferThreadClusterLengths_AK0_M_AK1 =
|
||||
ABlockTransferThreadClusterLengths_AK0_M_AK1_;
|
||||
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
|
||||
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
|
||||
ABlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kABlockTransferDstScalarPerVector_AK1 =
|
||||
ABlockTransferDstScalarPerVector_AK1;
|
||||
static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
|
||||
|
||||
using BBlockTransferThreadClusterLengths_BK0_N_BK1 =
|
||||
BBlockTransferThreadClusterLengths_BK0_N_BK1_;
|
||||
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
|
||||
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
|
||||
BBlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kBBlockTransferDstScalarPerVector_BK1 =
|
||||
BBlockTransferDstScalarPerVector_BK1;
|
||||
static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
|
||||
|
||||
using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
|
||||
|
||||
static constexpr ck::BlockGemmPipelineScheduler kBlkGemmPipeSched = BlkGemmPipeSched;
|
||||
static constexpr ck::BlockGemmPipelineVersion kBlkGemmPipelineVer = BlkGemmPipelineVer;
|
||||
|
||||
using ComputeTypeA = ComputeTypeA_;
|
||||
using ComputeTypeB = ComputeTypeB_;
|
||||
|
||||
// Static member function to generate instance string
|
||||
static std::string instance_string()
|
||||
{
|
||||
std::ostringstream oss;
|
||||
|
||||
// Kernel type name
|
||||
oss << "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3";
|
||||
|
||||
// Template parameters in exact order
|
||||
oss << "<" << kNDimSpatial; // 1. NDimSpatial
|
||||
oss << "," << detail::layout_name<InLayout>(); // 2. InLayout
|
||||
oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
|
||||
oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
|
||||
oss << "," << detail::type_name<InDataType>(); // 5. InDataType
|
||||
oss << "," << detail::type_name<WeiDataType>(); // 6. WeiDataType
|
||||
oss << "," << detail::type_name<OutDataType>(); // 7. OutDataType
|
||||
oss << "," << detail::type_name<AccDataType>(); // 8. AccDataType
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<InElementwiseOperation>(); // 9. InElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<WeiElementwiseOperation>(); // 10.
|
||||
// WeiElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<OutElementwiseOperation>(); // 11.
|
||||
// OutElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::conv_bwd_weight_spec_name(
|
||||
kConvBackwardWeightSpecialization); // 12. ConvBackwardWeightSpecialization
|
||||
oss << "," << kBlockSize; // 13. BlockSize
|
||||
oss << "," << kMPerBlock; // 14. MPerBlock
|
||||
oss << "," << kNPerBlock; // 15. NPerBlock
|
||||
oss << "," << kKPerBlock; // 16. KPerBlock
|
||||
oss << "," << kABK1; // 17. ABK1
|
||||
oss << "," << kMPerWmma; // 18. MPerWmma
|
||||
oss << "," << kNPerWmma; // 19. NPerWmma
|
||||
oss << "," << kMRepeat; // 20. MRepeat
|
||||
oss << "," << kNRepeat; // 21. NRepeat
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_AK0_M_AK1>(); // 22.
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>(); // 23.
|
||||
oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>(); // 24.
|
||||
oss << "," << kABlockTransferSrcVectorDim; // 25.
|
||||
oss << "," << kABlockTransferSrcScalarPerVector; // 26.
|
||||
oss << "," << kABlockTransferDstScalarPerVector_AK1; // 27.
|
||||
oss << "," << (kABlockLdsAddExtraM ? "true" : "false"); // 28.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_BK0_N_BK1>(); // 29.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>(); // 30.
|
||||
oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>(); // 31.
|
||||
oss << "," << kBBlockTransferSrcVectorDim; // 32.
|
||||
oss << "," << kBBlockTransferSrcScalarPerVector; // 33.
|
||||
oss << "," << kBBlockTransferDstScalarPerVector_BK1; // 34.
|
||||
oss << "," << (kBBlockLdsAddExtraN ? "true" : "false"); // 35.
|
||||
oss << "," << kCShuffleMRepeatPerShuffle; // 36.
|
||||
oss << "," << kCShuffleNRepeatPerShuffle; // 37.
|
||||
oss << ","
|
||||
<< detail::sequence_name<
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 38.
|
||||
oss << "," << kCShuffleBlockTransferScalarPerVector_NPerBlock; // 39.
|
||||
oss << "," << detail::pipeline_scheduler_name(kBlkGemmPipeSched); // 40.
|
||||
oss << "," << detail::pipeline_version_name(kBlkGemmPipelineVer); // 41.
|
||||
oss << "," << detail::type_name<ComputeTypeA>(); // 42.
|
||||
oss << "," << detail::type_name<ComputeTypeB>(); // 43.
|
||||
oss << "," << kMaxTransposeTransferSrcScalarPerVector; // 44.
|
||||
oss << "," << kMaxTransposeTransferDstScalarPerVector; // 45.
|
||||
oss << ">";
|
||||
|
||||
return oss.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace reflect
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,284 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "instance_traits.hpp"
|
||||
#include "instance_traits_util.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
|
||||
|
||||
// Forward declaration to avoid circular dependency
|
||||
namespace ck::tensor_operation::device {
|
||||
|
||||
// Declaration only: introduces the template signature of the device operation
// without defining it. No default template arguments are given here.
template <ck::index_t NDimSpatial,
          // Tensor layouts
          typename InLayout,
          typename WeiLayout,
          typename OutLayout,
          // Data types
          typename InDataType,
          typename WeiDataType,
          typename OutDataType,
          typename AccDataType,
          // Elementwise operations
          typename InElementwiseOperation,
          typename WeiElementwiseOperation,
          typename OutElementwiseOperation,
          ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
              ConvBackwardWeightSpecialization,
          // Block / tile sizes
          ck::index_t BlockSize,
          ck::index_t MPerBlock,
          ck::index_t NPerBlock,
          ck::index_t K0PerBlock,
          ck::index_t K1,
          ck::index_t MPerXDL,
          ck::index_t NPerXDL,
          ck::index_t MXdlPerWave,
          ck::index_t NXdlPerWave,
          // A block transfer
          typename ABlockTransferThreadClusterLengths_K0_M_K1,
          typename ABlockTransferThreadClusterArrangeOrder,
          typename ABlockTransferSrcAccessOrder,
          ck::index_t ABlockTransferSrcVectorDim,
          ck::index_t ABlockTransferSrcScalarPerVector,
          ck::index_t ABlockTransferDstScalarPerVector_K1,
          bool ABlockLdsAddExtraM,
          // B block transfer
          typename BBlockTransferThreadClusterLengths_K0_N_K1,
          typename BBlockTransferThreadClusterArrangeOrder,
          typename BBlockTransferSrcAccessOrder,
          ck::index_t BBlockTransferSrcVectorDim,
          ck::index_t BBlockTransferSrcScalarPerVector,
          ck::index_t BBlockTransferDstScalarPerVector_K1,
          bool BBlockLdsAddExtraN,
          // C shuffle / C block transfer
          ck::index_t CShuffleMXdlPerWavePerShuffle,
          ck::index_t CShuffleNXdlPerWavePerShuffle,
          typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
          // Pipeline configuration
          ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
          ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
          // Compute types
          typename ComputeTypeA,
          typename ComputeTypeB>
struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3;
|
||||
|
||||
} // namespace ck::tensor_operation::device
|
||||
|
||||
namespace ck_tile {
|
||||
namespace reflect {
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout_,
|
||||
typename WeiLayout_,
|
||||
typename OutLayout_,
|
||||
typename InDataType_,
|
||||
typename WeiDataType_,
|
||||
typename OutDataType_,
|
||||
typename AccDataType_,
|
||||
typename InElementwiseOperation_,
|
||||
typename WeiElementwiseOperation_,
|
||||
typename OutElementwiseOperation_,
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
|
||||
ConvBackwardWeightSpecialization,
|
||||
ck::index_t BlockSize,
|
||||
ck::index_t MPerBlock,
|
||||
ck::index_t NPerBlock,
|
||||
ck::index_t K0PerBlock,
|
||||
ck::index_t K1,
|
||||
ck::index_t MPerXDL,
|
||||
ck::index_t NPerXDL,
|
||||
ck::index_t MXdlPerWave,
|
||||
ck::index_t NXdlPerWave,
|
||||
typename ABlockTransferThreadClusterLengths_K0_M_K1_,
|
||||
typename ABlockTransferThreadClusterArrangeOrder_,
|
||||
typename ABlockTransferSrcAccessOrder_,
|
||||
ck::index_t ABlockTransferSrcVectorDim,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_K1,
|
||||
bool ABlockLdsAddExtraM,
|
||||
typename BBlockTransferThreadClusterLengths_K0_N_K1_,
|
||||
typename BBlockTransferThreadClusterArrangeOrder_,
|
||||
typename BBlockTransferSrcAccessOrder_,
|
||||
ck::index_t BBlockTransferSrcVectorDim,
|
||||
ck::index_t BBlockTransferSrcScalarPerVector,
|
||||
ck::index_t BBlockTransferDstScalarPerVector_K1,
|
||||
bool BBlockLdsAddExtraN,
|
||||
ck::index_t CShuffleMXdlPerWavePerShuffle,
|
||||
ck::index_t CShuffleNXdlPerWavePerShuffle,
|
||||
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
|
||||
ck::BlockGemmPipelineScheduler BlkGemmPipeSched,
|
||||
ck::BlockGemmPipelineVersion BlkGemmPipelineVer,
|
||||
typename ComputeTypeA_,
|
||||
typename ComputeTypeB_>
|
||||
struct InstanceTraits<ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<
|
||||
NDimSpatial,
|
||||
InLayout_,
|
||||
WeiLayout_,
|
||||
OutLayout_,
|
||||
InDataType_,
|
||||
WeiDataType_,
|
||||
OutDataType_,
|
||||
AccDataType_,
|
||||
InElementwiseOperation_,
|
||||
WeiElementwiseOperation_,
|
||||
OutElementwiseOperation_,
|
||||
ConvBackwardWeightSpecialization,
|
||||
BlockSize,
|
||||
MPerBlock,
|
||||
NPerBlock,
|
||||
K0PerBlock,
|
||||
K1,
|
||||
MPerXDL,
|
||||
NPerXDL,
|
||||
MXdlPerWave,
|
||||
NXdlPerWave,
|
||||
ABlockTransferThreadClusterLengths_K0_M_K1_,
|
||||
ABlockTransferThreadClusterArrangeOrder_,
|
||||
ABlockTransferSrcAccessOrder_,
|
||||
ABlockTransferSrcVectorDim,
|
||||
ABlockTransferSrcScalarPerVector,
|
||||
ABlockTransferDstScalarPerVector_K1,
|
||||
ABlockLdsAddExtraM,
|
||||
BBlockTransferThreadClusterLengths_K0_N_K1_,
|
||||
BBlockTransferThreadClusterArrangeOrder_,
|
||||
BBlockTransferSrcAccessOrder_,
|
||||
BBlockTransferSrcVectorDim,
|
||||
BBlockTransferSrcScalarPerVector,
|
||||
BBlockTransferDstScalarPerVector_K1,
|
||||
BBlockLdsAddExtraN,
|
||||
CShuffleMXdlPerWavePerShuffle,
|
||||
CShuffleNXdlPerWavePerShuffle,
|
||||
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
|
||||
CBlockTransferScalarPerVector_NWaveNPerXdl,
|
||||
BlkGemmPipeSched,
|
||||
BlkGemmPipelineVer,
|
||||
ComputeTypeA_,
|
||||
ComputeTypeB_>>
|
||||
{
|
||||
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3";
|
||||
|
||||
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
|
||||
|
||||
using InLayout = InLayout_;
|
||||
using WeiLayout = WeiLayout_;
|
||||
using OutLayout = OutLayout_;
|
||||
|
||||
using InDataType = InDataType_;
|
||||
using WeiDataType = WeiDataType_;
|
||||
using OutDataType = OutDataType_;
|
||||
using AccDataType = AccDataType_;
|
||||
|
||||
using InElementwiseOperation = InElementwiseOperation_;
|
||||
using WeiElementwiseOperation = WeiElementwiseOperation_;
|
||||
using OutElementwiseOperation = OutElementwiseOperation_;
|
||||
|
||||
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
|
||||
|
||||
static constexpr ck::index_t kBlockSize = BlockSize;
|
||||
static constexpr ck::index_t kMPerBlock = MPerBlock;
|
||||
static constexpr ck::index_t kNPerBlock = NPerBlock;
|
||||
static constexpr ck::index_t kK0PerBlock = K0PerBlock;
|
||||
static constexpr ck::index_t kK1 = K1;
|
||||
static constexpr ck::index_t kMPerXDL = MPerXDL;
|
||||
static constexpr ck::index_t kNPerXDL = NPerXDL;
|
||||
static constexpr ck::index_t kMXdlPerWave = MXdlPerWave;
|
||||
static constexpr ck::index_t kNXdlPerWave = NXdlPerWave;
|
||||
|
||||
using ABlockTransferThreadClusterLengths_K0_M_K1 = ABlockTransferThreadClusterLengths_K0_M_K1_;
|
||||
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
|
||||
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
|
||||
ABlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kABlockTransferDstScalarPerVector_K1 =
|
||||
ABlockTransferDstScalarPerVector_K1;
|
||||
static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
|
||||
|
||||
using BBlockTransferThreadClusterLengths_K0_N_K1 = BBlockTransferThreadClusterLengths_K0_N_K1_;
|
||||
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
|
||||
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
|
||||
static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
|
||||
static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
|
||||
BBlockTransferSrcScalarPerVector;
|
||||
static constexpr ck::index_t kBBlockTransferDstScalarPerVector_K1 =
|
||||
BBlockTransferDstScalarPerVector_K1;
|
||||
static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
|
||||
|
||||
static constexpr ck::index_t kCShuffleMXdlPerWavePerShuffle = CShuffleMXdlPerWavePerShuffle;
|
||||
static constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = CShuffleNXdlPerWavePerShuffle;
|
||||
|
||||
using CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
|
||||
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
|
||||
static constexpr ck::index_t kCBlockTransferScalarPerVector_NWaveNPerXdl =
|
||||
CBlockTransferScalarPerVector_NWaveNPerXdl;
|
||||
|
||||
static constexpr ck::BlockGemmPipelineScheduler kBlkGemmPipeSched = BlkGemmPipeSched;
|
||||
static constexpr ck::BlockGemmPipelineVersion kBlkGemmPipelineVer = BlkGemmPipelineVer;
|
||||
|
||||
using ComputeTypeA = ComputeTypeA_;
|
||||
using ComputeTypeB = ComputeTypeB_;
|
||||
|
||||
// Static member function to generate instance string
|
||||
static std::string instance_string()
|
||||
{
|
||||
std::ostringstream oss;
|
||||
|
||||
// Kernel type name
|
||||
oss << "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3";
|
||||
|
||||
// Template parameters in exact order
|
||||
oss << "<" << kNDimSpatial; // 1. NDimSpatial
|
||||
oss << "," << detail::layout_name<InLayout>(); // 2. InLayout
|
||||
oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
|
||||
oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
|
||||
oss << "," << detail::type_name<InDataType>(); // 5. InDataType
|
||||
oss << "," << detail::type_name<WeiDataType>(); // 6. WeiDataType
|
||||
oss << "," << detail::type_name<OutDataType>(); // 7. OutDataType
|
||||
oss << "," << detail::type_name<AccDataType>(); // 8. AccDataType
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<InElementwiseOperation>(); // 9. InElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<WeiElementwiseOperation>(); // 10.
|
||||
// WeiElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::elementwise_op_name<OutElementwiseOperation>(); // 11.
|
||||
// OutElementwiseOperation
|
||||
oss << ","
|
||||
<< detail::conv_bwd_weight_spec_name(
|
||||
kConvBackwardWeightSpecialization); // 12. ConvBackwardWeightSpecialization
|
||||
oss << "," << kBlockSize; // 13. BlockSize
|
||||
oss << "," << kMPerBlock; // 14. MPerBlock
|
||||
oss << "," << kNPerBlock; // 15. NPerBlock
|
||||
oss << "," << kK0PerBlock; // 16. K0PerBlock
|
||||
oss << "," << kK1; // 17. K1
|
||||
oss << "," << kMPerXDL; // 18. MPerXDL
|
||||
oss << "," << kNPerXDL; // 19. NPerXDL
|
||||
oss << "," << kMXdlPerWave; // 20. MXdlPerWave
|
||||
oss << "," << kNXdlPerWave; // 21. NXdlPerWave
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_K0_M_K1>(); // 22.
|
||||
oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>(); // 23.
|
||||
oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>(); // 24.
|
||||
oss << "," << kABlockTransferSrcVectorDim; // 25.
|
||||
oss << "," << kABlockTransferSrcScalarPerVector; // 26.
|
||||
oss << "," << kABlockTransferDstScalarPerVector_K1; // 27.
|
||||
oss << "," << (kABlockLdsAddExtraM ? "true" : "false"); // 28.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_K0_N_K1>(); // 29.
|
||||
oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>(); // 30.
|
||||
oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>(); // 31.
|
||||
oss << "," << kBBlockTransferSrcVectorDim; // 32.
|
||||
oss << "," << kBBlockTransferSrcScalarPerVector; // 33.
|
||||
oss << "," << kBBlockTransferDstScalarPerVector_K1; // 34.
|
||||
oss << "," << (kBBlockLdsAddExtraN ? "true" : "false"); // 35.
|
||||
oss << "," << kCShuffleMXdlPerWavePerShuffle; // 36.
|
||||
oss << "," << kCShuffleNXdlPerWavePerShuffle; // 37.
|
||||
oss << ","
|
||||
<< detail::sequence_name<
|
||||
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 38.
|
||||
oss << "," << kCBlockTransferScalarPerVector_NWaveNPerXdl; // 39.
|
||||
oss << "," << detail::pipeline_scheduler_name(kBlkGemmPipeSched); // 40.
|
||||
oss << "," << detail::pipeline_version_name(kBlkGemmPipelineVer); // 41.
|
||||
oss << "," << detail::type_name<ComputeTypeA>(); // 42.
|
||||
oss << "," << detail::type_name<ComputeTypeB>(); // 43.
|
||||
oss << ">";
|
||||
|
||||
return oss.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace reflect
|
||||
} // namespace ck_tile
|
||||
@@ -119,13 +119,29 @@ add_ck_builder_test(test_ckb_conv_builder
|
||||
# - Group convolution (v3, standard, large tensor, WMMA, DL variants)
|
||||
# - Backward weight group convolution (XDL, XDL V3, DL, multiple-D XDL, two-stage XDL; WMMA V3 variants when CK_USE_WMMA is set)
|
||||
# Requires kernel compilation to validate the generated strings through the base class.
|
||||
add_ck_builder_test(test_ckb_instance_string
|
||||
|
||||
set(INSTANCE_STRING_TESTS
|
||||
test_instance_string_fwd_grp_conv_v3.cpp
|
||||
test_instance_string_fwd_grp_conv.cpp
|
||||
test_instance_string_fwd_grp_conv_large_tensor.cpp
|
||||
test_instance_string_fwd_grp_conv_wmma.cpp
|
||||
test_instance_string_fwd_grp_conv_dl.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_xdl.cpp)
|
||||
test_instance_string_bwd_weight_grp_conv_xdl.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_dl.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_multiple_d_xdl.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_two_stage_xdl.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_xdl_v3.cpp
|
||||
)
|
||||
|
||||
if (CK_USE_WMMA)
|
||||
list(APPEND INSTANCE_STRING_TESTS
|
||||
test_instance_string_bwd_weight_grp_conv_wmma_v3.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_multiple_d_wmma_v3.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_two_stage_wmma_v3.cpp
|
||||
)
|
||||
endif()
|
||||
|
||||
add_ck_builder_test(test_ckb_instance_string ${INSTANCE_STRING_TESTS})
|
||||
|
||||
# Tests the forward convolution builder across multiple data types and dimensions.
|
||||
# Individual tests are split into separate files to enable parallel compilation.
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
// Use the first instance from device_grouped_conv_bwd_weight_dl_f16_instances
|
||||
using InstanceTuple =
|
||||
ck::tensor_operation::device::instance::device_grouped_conv_bwd_weight_dl_f16_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
|
||||
ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
|
||||
ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default>;
|
||||
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Expected string based on the generic instance
|
||||
std::string expected_str = "DeviceGroupedConvBwdWeight_Dl"
|
||||
"<2" // NDimSpatial
|
||||
",GNHWC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",GNHWK" // OutLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",PassThrough" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",256" // BlockSize
|
||||
",128" // MPerBlock
|
||||
",128" // NPerBlock
|
||||
",16" // K0PerBlock
|
||||
",1" // K1
|
||||
",4" // M1PerThread
|
||||
",4" // N1PerThread
|
||||
",1" // KPerThread
|
||||
",Seq(8,2)" // M1N1ThreadClusterM1Xs
|
||||
",Seq(8,2)" // M1N1ThreadClusterN1Xs
|
||||
",Seq(1,8,1,1,1)" // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
|
||||
",Seq(1,2,1,128,1)" // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
|
||||
",Seq(0,2,3,1,4)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(0,2,3,1,4)" // ABlockTransferSrcAccessOrder
|
||||
",Seq(1,1,1,1,1)" // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
|
||||
",Seq(0,2,3,1,4)" // ABlockTransferSrcVectorTensorContiguousDimOrder
|
||||
",Seq(1,1,1,1,1)" // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
|
||||
",Seq(1,1,1,8,1)" // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
|
||||
",Seq(1,16,1,16,1)" // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
|
||||
",Seq(0,1,4,2,3)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(0,1,4,2,3)" // BBlockTransferSrcAccessOrder
|
||||
",Seq(1,1,1,1,1)" // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
|
||||
",Seq(0,1,4,2,3)" // BBlockTransferSrcVectorTensorContiguousDimOrder
|
||||
",Seq(1,1,1,1,1)" // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
|
||||
",Seq(0,1,2,3,4,5)" // CThreadTransferSrcDstAccessOrder
|
||||
",5" // CThreadTransferSrcDstVectorDim
|
||||
",1" // CThreadTransferDstScalarPerVector
|
||||
">";
|
||||
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvDl)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -0,0 +1,86 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_scale_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
// Use the first instance from device_grouped_conv_bwd_weight_wmma_c_shuffle_f16_scale_instances
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_wmma_c_shuffle_f16_scale_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
|
||||
ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
|
||||
ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default>;
|
||||
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Expected string based on the generic instance
|
||||
std::string expected_str =
|
||||
"DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3"
|
||||
"<2" // NDimSpatial
|
||||
",GNHWC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",GNHWK" // OutLayout
|
||||
",EmptyTuple" // DsLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",EmptyTuple" // DsDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",Scale" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",64" // BlockSize
|
||||
",64" // MPerBlock
|
||||
",64" // NPerBlock
|
||||
",32" // KPerBlock
|
||||
",8" // ABK1
|
||||
",16" // MPerWmma
|
||||
",16" // NPerWmma
|
||||
",4" // MRepeat
|
||||
",2" // NRepeat
|
||||
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
|
||||
",1" // ABlockTransferSrcVectorDim
|
||||
",2" // ABlockTransferSrcScalarPerVector
|
||||
",4" // ABlockTransferDstScalarPerVector_AK1
|
||||
",true" // ABlockLdsAddExtraM
|
||||
",Seq(4,8,1)" // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
|
||||
",1" // BBlockTransferSrcVectorDim
|
||||
",2" // BBlockTransferSrcScalarPerVector
|
||||
",4" // BBlockTransferDstScalarPerVector_BK1
|
||||
",true" // BBlockLdsAddExtraN
|
||||
",1" // CShuffleMRepeatPerShuffle
|
||||
",1" // CShuffleNRepeatPerShuffle
|
||||
",Seq(1,16,1,4)" // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
",2" // CShuffleBlockTransferScalarPerVector_NPerBlock
|
||||
",Intrawave" // BlkGemmPipeSched
|
||||
",v1" // BlkGemmPipelineVer
|
||||
",fp16" // ComputeTypeA
|
||||
",fp16" // ComputeTypeB
|
||||
">";
|
||||
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvMultipleDWmmaV3)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -0,0 +1,84 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_scale_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
// Use the first instance from device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_scale_instances
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_scale_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
|
||||
ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
|
||||
ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default>;
|
||||
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Expected string based on the generic instance
|
||||
std::string expected_str =
|
||||
"DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle"
|
||||
"<2" // NDimSpatial
|
||||
",GNHWC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",GNHWK" // OutLayout
|
||||
",EmptyTuple" // DsLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",EmptyTuple" // DsDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",Scale" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",64" // BlockSize
|
||||
",64" // MPerBlock
|
||||
",64" // NPerBlock
|
||||
",4" // K0PerBlock
|
||||
",8" // K1
|
||||
",32" // MPerXDL
|
||||
",32" // NPerXDL
|
||||
",2" // MXdlPerWave
|
||||
",2" // NXdlPerWave
|
||||
",Seq(1,4,8,2)" // ABlockTransferThreadClusterLengths_K0_M_K1
|
||||
",Seq(0,3,1,2)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(0,2,1,3)" // ABlockTransferSrcAccessOrder
|
||||
",2" // ABlockTransferSrcVectorDim
|
||||
",2" // ABlockTransferSrcScalarPerVector
|
||||
",4" // ABlockTransferDstScalarPerVector_K1
|
||||
",true" // ABlockLdsAddExtraM
|
||||
",Seq(1,4,8,2)" // BBlockTransferThreadClusterLengths_K0_N_K1
|
||||
",Seq(0,3,1,2)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(0,2,1,3)" // BBlockTransferSrcAccessOrder
|
||||
",2" // BBlockTransferSrcVectorDim
|
||||
",2" // BBlockTransferSrcScalarPerVector
|
||||
",4" // BBlockTransferDstScalarPerVector_K1
|
||||
",true" // BBlockLdsAddExtraN
|
||||
",1" // CShuffleMXdlPerWavePerShuffle
|
||||
",1" // CShuffleNXdlPerWavePerShuffle
|
||||
",Seq(1,16,1,4)" // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
",2" // CBlockTransferScalarPerVector_NWaveNPerXdl
|
||||
",fp16" // ComputeTypeA
|
||||
",fp16" // ComputeTypeB
|
||||
">";
|
||||
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvMultipleDXdl)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -0,0 +1,90 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
// Use the first instance from
|
||||
// device_grouped_conv_bwd_weight_two_stage_nhwgc_wmma_c_shuffle_f16_instances
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_two_stage_nhwgc_wmma_c_shuffle_f16_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
|
||||
ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
|
||||
ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default,
|
||||
ck::BlockGemmPipelineScheduler::Intrawave,
|
||||
ck::BlockGemmPipelineVersion::v1>;
|
||||
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Expected string based on the first instance (BlockSize=32, MPerBlock=16, NPerBlock=16, etc.)
|
||||
std::string expected_str =
|
||||
"DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3"
|
||||
"<2" // NDimSpatial
|
||||
",GNHWC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",GNHWK" // OutLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",PassThrough" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",32" // BlockSize
|
||||
",16" // MPerBlock
|
||||
",16" // NPerBlock
|
||||
",32" // KPerBlock
|
||||
",8" // ABK1
|
||||
",16" // MPerWmma
|
||||
",16" // NPerWmma
|
||||
",1" // MRepeat
|
||||
",1" // NRepeat
|
||||
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
|
||||
",1" // ABlockTransferSrcVectorDim
|
||||
",1" // ABlockTransferSrcScalarPerVector
|
||||
",4" // ABlockTransferDstScalarPerVector_AK1
|
||||
",false" // ABlockLdsAddExtraM
|
||||
",Seq(4,8,1)" // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
|
||||
",1" // BBlockTransferSrcVectorDim
|
||||
",1" // BBlockTransferSrcScalarPerVector
|
||||
",4" // BBlockTransferDstScalarPerVector_BK1
|
||||
",false" // BBlockLdsAddExtraN
|
||||
",1" // CShuffleMRepeatPerShuffle
|
||||
",1" // CShuffleNRepeatPerShuffle
|
||||
",Seq(1,4,1,8)" // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
",1" // CShuffleBlockTransferScalarPerVector_NPerBlock
|
||||
",Intrawave" // BlkGemmPipeSched
|
||||
",v1" // BlkGemmPipelineVer
|
||||
",1" // NumGroupsToMerge
|
||||
",fp16" // ComputeTypeA
|
||||
",fp16" // ComputeTypeB
|
||||
",1" // TransposeTransferSrcScalarPerVector
|
||||
",1" // TransposeTransferDstScalarPerVector
|
||||
">";
|
||||
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvTwoStageWmmaV3)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -0,0 +1,90 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
// Use the first instance from
|
||||
// device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
|
||||
ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
|
||||
ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default,
|
||||
ck::BlockGemmPipelineScheduler::Intrawave,
|
||||
ck::BlockGemmPipelineVersion::v1>;
|
||||
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Expected string based on the first instance in the tuple
|
||||
std::string expected_str =
|
||||
"DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle"
|
||||
"<2" // NDimSpatial
|
||||
",GNHWC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",GNHWK" // OutLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",PassThrough" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",64" // BlockSize
|
||||
",16" // MPerBlock
|
||||
",16" // NPerBlock
|
||||
",32" // KPerBlock
|
||||
",8" // K1
|
||||
",16" // MPerXDL
|
||||
",16" // NPerXDL
|
||||
",1" // MXdlPerWave
|
||||
",1" // NXdlPerWave
|
||||
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_K0_M_K1
|
||||
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
|
||||
",1" // ABlockTransferSrcVectorDim
|
||||
",1" // ABlockTransferSrcScalarPerVector
|
||||
",4" // ABlockTransferDstScalarPerVector_K1
|
||||
",false" // ABlockLdsAddExtraM
|
||||
",Seq(4,8,1)" // BBlockTransferThreadClusterLengths_K0_N_K1
|
||||
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
|
||||
",1" // BBlockTransferSrcVectorDim
|
||||
",1" // BBlockTransferSrcScalarPerVector
|
||||
",4" // BBlockTransferDstScalarPerVector_K1
|
||||
",false" // BBlockLdsAddExtraN
|
||||
",1" // CShuffleMXdlPerWavePerShuffle
|
||||
",1" // CShuffleNXdlPerWavePerShuffle
|
||||
",Seq(1,8,1,8)" // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
",1" // CBlockTransferScalarPerVector_NWaveNPerXdl
|
||||
",Intrawave" // BlkGemmPipeSched
|
||||
",v1" // BlkGemmPipelineVer
|
||||
",1" // NumGroupsToMerge
|
||||
",fp16" // ComputeTypeA
|
||||
",fp16" // ComputeTypeB
|
||||
",1" // TransposeTransferSrcScalarPerVector
|
||||
",1" // TransposeTransferDstScalarPerVector
|
||||
">";
|
||||
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvTwoStageXdl)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -0,0 +1,90 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// #ifdef _NOT_DEFINED_
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_wmma_c_shuffle_bf16_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_operation::device::instance::NHWGC, // InLayout
|
||||
ck::tensor_operation::device::instance::GKYXC, // WeiLayout
|
||||
ck::tensor_operation::device::instance::NHWGK, // OutLayout
|
||||
ck::tensor_operation::device::instance::ConvBwdWeightDefault>;
|
||||
|
||||
// Expected complete instance string
|
||||
std::string expected_str = "DeviceGroupedConvBwdWeight_Wmma_CShuffle"
|
||||
"<2" // NDimSpatial
|
||||
",NHWGC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",NHWGK" // OutLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",PassThrough" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",64" // BlockSize
|
||||
",32" // MPerBlock
|
||||
",32" // NPerBlock
|
||||
",32" // KPerBlock
|
||||
",8" // ABK1
|
||||
",16" // MPerWmma
|
||||
",16" // NPerWmma
|
||||
",2" // MRepeat
|
||||
",1" // NRepeat
|
||||
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
|
||||
",1" // ABlockTransferSrcVectorDim
|
||||
",2" // ABlockTransferSrcScalarPerVector
|
||||
",2" // ABlockTransferDstScalarPerVector_AK1
|
||||
",false" // ABlockLdsAddExtraM
|
||||
",Seq(4,16,1)" // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
|
||||
",1" // BBlockTransferSrcVectorDim
|
||||
",2" // BBlockTransferSrcScalarPerVector
|
||||
",2" // BBlockTransferDstScalarPerVector_BK1
|
||||
",false" // BBlockLdsAddExtraN
|
||||
",1" // CShuffleMRepeatPerShuffle
|
||||
",1" // CShuffleNRepeatPerShuffle
|
||||
",Seq(1,8,1,8)" // CShuffleBlockTransferClusterLengths
|
||||
",2" // CShuffleBlockTransferScalarPerVector_NPerBlock
|
||||
",Intrawave" // BlkGemmPipeSched
|
||||
",v1" // BlkGemmPipelineVer
|
||||
",fp16" // ComputeTypeA
|
||||
",fp16" // ComputeTypeB
|
||||
",1" // MaxTransposeTransferSrcScalarPerVector
|
||||
",1" // MaxTransposeTransferDstScalarPerVector
|
||||
">";
|
||||
|
||||
// Get the first instance from the tuple
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Test describe() through base class pointer for WMMA V3 variant
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvWmmaV3)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// #endif
|
||||
@@ -0,0 +1,90 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// #ifdef _NOT_DEFINED_
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_wmma_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_v3_wmma_c_shuffle_f16_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_operation::device::instance::NHWGC, // InLayout
|
||||
ck::tensor_operation::device::instance::GKYXC, // WeiLayout
|
||||
ck::tensor_operation::device::instance::NHWGK, // OutLayout
|
||||
ck::tensor_operation::device::instance::ConvBwdWeightDefault>;
|
||||
|
||||
// Expected complete instance string that the test below compares against
// desc->instance_string(). Declared const: it is a read-only test constant and
// must not be modifiable namespace-scope state. Each adjacent literal carries
// one comma-separated field; the trailing comment names the template parameter
// that field is meant to describe.
const std::string expected_str = "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3"
                                 "<2"             // NDimSpatial
                                 ",NHWGC"         // InLayout
                                 ",GKYXC"         // WeiLayout
                                 ",NHWGK"         // OutLayout
                                 ",fp16"          // InDataType
                                 ",fp16"          // WeiDataType
                                 ",fp16"          // OutDataType
                                 ",fp32"          // AccDataType
                                 ",PassThrough"   // InElementwiseOperation
                                 ",PassThrough"   // WeiElementwiseOperation
                                 ",PassThrough"   // OutElementwiseOperation
                                 ",Default"       // ConvBackwardWeightSpecialization
                                 ",64"            // BlockSize
                                 ",32"            // MPerBlock
                                 ",32"            // NPerBlock
                                 ",32"            // KPerBlock
                                 ",8"             // ABK1
                                 ",16"            // MPerWmma
                                 ",16"            // NPerWmma
                                 ",2"             // MRepeat
                                 ",1"             // NRepeat
                                 ",Seq(4,8,1)"    // ABlockTransferThreadClusterLengths_AK0_M_AK1
                                 ",Seq(2,0,1)"    // ABlockTransferThreadClusterArrangeOrder
                                 ",Seq(1,0,2)"    // ABlockTransferSrcAccessOrder
                                 ",1"             // ABlockTransferSrcVectorDim
                                 ",2"             // ABlockTransferSrcScalarPerVector
                                 ",2"             // ABlockTransferDstScalarPerVector_AK1
                                 ",false"         // ABlockLdsAddExtraM
                                 ",Seq(4,16,1)"   // BBlockTransferThreadClusterLengths_BK0_N_BK1
                                 ",Seq(2,0,1)"    // BBlockTransferThreadClusterArrangeOrder
                                 ",Seq(1,0,2)"    // BBlockTransferSrcAccessOrder
                                 ",1"             // BBlockTransferSrcVectorDim
                                 ",2"             // BBlockTransferSrcScalarPerVector
                                 ",2"             // BBlockTransferDstScalarPerVector_BK1
                                 ",false"         // BBlockLdsAddExtraN
                                 ",1"             // CShuffleMRepeatPerShuffle
                                 ",1"             // CShuffleNRepeatPerShuffle
                                 ",Seq(1,8,1,8)"  // CShuffleBlockTransferClusterLengths
                                 ",2"             // CShuffleBlockTransferScalarPerVector_NPerBlock
                                 ",Intrawave"     // BlkGemmPipeSched
                                 ",v1"            // BlkGemmPipelineVer
                                 ",fp16"          // ComputeTypeA
                                 ",fp16"          // ComputeTypeB
                                 ",1"             // MaxTransposeTransferSrcScalarPerVector
                                 ",1"             // MaxTransposeTransferDstScalarPerVector
                                 ">";
|
||||
|
||||
// Get the first instance from the tuple
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Test describe() through base class pointer for WMMA V3 variant
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvWmmaV3)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// #endif
|
||||
@@ -0,0 +1,86 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_operation::device::instance::GNHWC, // InLayout
|
||||
ck::tensor_operation::device::instance::GKYXC, // WeiLayout
|
||||
ck::tensor_operation::device::instance::GNHWK, // OutLayout
|
||||
ck::tensor_operation::device::instance::ConvBwdWeightDefault, // ConvBwdWeightSpecialization
|
||||
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
|
||||
ck::BlockGemmPipelineVersion::v1>; // BlkGemmPipelineVer
|
||||
|
||||
// Element 0 of InstanceTuple is the device operation type under test.
using DeviceInstance = std::tuple_element_t<0, InstanceTuple>;
|
||||
|
||||
// Expected complete instance string that the test below compares against
// desc->instance_string(). Declared const: it is a read-only test constant and
// must not be modifiable namespace-scope state. Each adjacent literal carries
// one comma-separated field; the trailing comment names the template parameter
// that field is meant to describe.
const std::string expected_str =
    "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3"
    "<2"             // NDimSpatial
    ",GNHWC"         // InLayout
    ",GKYXC"         // WeiLayout
    ",GNHWK"         // OutLayout
    ",fp16"          // InDataType
    ",fp16"          // WeiDataType
    ",fp16"          // OutDataType
    ",fp32"          // AccDataType
    ",PassThrough"   // InElementwiseOperation
    ",PassThrough"   // WeiElementwiseOperation
    ",PassThrough"   // OutElementwiseOperation
    ",Default"       // ConvBackwardWeightSpecialization
    ",64"            // BlockSize
    ",32"            // MPerBlock
    ",32"            // NPerBlock
    ",32"            // K0PerBlock
    ",8"             // K1
    ",32"            // MPerXDL
    ",32"            // NPerXDL
    ",1"             // MXdlPerWave
    ",1"             // NXdlPerWave
    ",Seq(4,8,1)"    // ABlockTransferThreadClusterLengths_K0_M_K1
    ",Seq(2,0,1)"    // ABlockTransferThreadClusterArrangeOrder
    ",Seq(1,0,2)"    // ABlockTransferSrcAccessOrder
    ",1"             // ABlockTransferSrcVectorDim
    ",2"             // ABlockTransferSrcScalarPerVector
    ",2"             // ABlockTransferDstScalarPerVector_K1
    ",false"         // ABlockLdsAddExtraM
    ",Seq(4,16,1)"   // BBlockTransferThreadClusterLengths_K0_N_K1
    ",Seq(2,0,1)"    // BBlockTransferThreadClusterArrangeOrder
    ",Seq(1,0,2)"    // BBlockTransferSrcAccessOrder
    ",1"             // BBlockTransferSrcVectorDim
    ",2"             // BBlockTransferSrcScalarPerVector
    ",2"             // BBlockTransferDstScalarPerVector_K1
    ",false"         // BBlockLdsAddExtraN
    ",1"             // CShuffleMXdlPerWavePerShuffle
    ",1"             // CShuffleNXdlPerWavePerShuffle
    ",Seq(1,8,1,8)"  // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
    ",2"             // CBlockTransferScalarPerVector_NWaveNPerXdl
    ",Intrawave"     // BlkGemmPipeSched
    ",v1"            // BlkGemmPipelineVer
    ",fp16"          // ComputeTypeA
    ",fp16"          // ComputeTypeB
    ">";
|
||||
|
||||
// Test describe() through base class pointer for XDL V3 variant
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvXdlV3)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
Reference in New Issue
Block a user