[CK_BUILDER] Add backward weight instance traits for xdl cshuffle. (#3143)

* Add backward weight instance traits for xdl cshuffle.

To keep instance test file sizes reasonable, we start a new test_bwd_weight_instances_traits.cpp test file.

* Fix copyright notices.

* Remove (c) symbol, replace with (C).

Having UTF-8 in source caused an error with code generation.

[ROCm/composable_kernel commit: 6dbee64886]
This commit is contained in:
John Shumway
2025-11-04 06:34:00 -08:00
committed by GitHub
parent 052c043d99
commit 52204ff4e5
6 changed files with 520 additions and 3 deletions

View File

@@ -0,0 +1,285 @@
// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "instance_traits.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
// Forward declaration to avoid circular dependency
namespace ck::tensor_operation::device {
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename AccDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t K0PerBlock,
ck::index_t K1,
ck::index_t MPerXDL,
ck::index_t NPerXDL,
ck::index_t MXdlPerWave,
ck::index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_K0_M_K1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_K1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMXdlPerWavePerShuffle,
ck::index_t CShuffleNXdlPerWavePerShuffle,
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
typename ComputeTypeA,
typename ComputeTypeB,
ck::index_t MaxTransposeTransferSrcScalarPerVector,
ck::index_t MaxTransposeTransferDstScalarPerVector>
struct DeviceGroupedConvBwdWeight_Xdl_CShuffle;
} // namespace ck::tensor_operation::device
namespace ck_tile {
namespace reflect {
template <ck::index_t NDimSpatial,
typename InLayout_,
typename WeiLayout_,
typename OutLayout_,
typename InDataType_,
typename WeiDataType_,
typename OutDataType_,
typename AccDataType_,
typename InElementwiseOperation_,
typename WeiElementwiseOperation_,
typename OutElementwiseOperation_,
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
ConvBackwardWeightSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t K0PerBlock,
ck::index_t K1,
ck::index_t MPerXDL,
ck::index_t NPerXDL,
ck::index_t MXdlPerWave,
ck::index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_K0_M_K1_,
typename ABlockTransferThreadClusterArrangeOrder_,
typename ABlockTransferSrcAccessOrder_,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_K0_N_K1_,
typename BBlockTransferThreadClusterArrangeOrder_,
typename BBlockTransferSrcAccessOrder_,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_K1,
bool BBlockLdsAddExtraN,
ck::index_t CShuffleMXdlPerWavePerShuffle,
ck::index_t CShuffleNXdlPerWavePerShuffle,
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
typename ComputeTypeA_,
typename ComputeTypeB_,
ck::index_t MaxTransposeTransferSrcScalarPerVector,
ck::index_t MaxTransposeTransferDstScalarPerVector>
struct InstanceTraits<ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
NDimSpatial,
InLayout_,
WeiLayout_,
OutLayout_,
InDataType_,
WeiDataType_,
OutDataType_,
AccDataType_,
InElementwiseOperation_,
WeiElementwiseOperation_,
OutElementwiseOperation_,
ConvBackwardWeightSpecialization,
BlockSize,
MPerBlock,
NPerBlock,
K0PerBlock,
K1,
MPerXDL,
NPerXDL,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_K0_M_K1_,
ABlockTransferThreadClusterArrangeOrder_,
ABlockTransferSrcAccessOrder_,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
ABlockLdsAddExtraM,
BBlockTransferThreadClusterLengths_K0_N_K1_,
BBlockTransferThreadClusterArrangeOrder_,
BBlockTransferSrcAccessOrder_,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BBlockLdsAddExtraN,
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
CBlockTransferScalarPerVector_NWaveNPerXdl,
ComputeTypeA_,
ComputeTypeB_,
MaxTransposeTransferSrcScalarPerVector,
MaxTransposeTransferDstScalarPerVector>>
{
static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeight_Xdl_CShuffle";
static constexpr ck::index_t kNDimSpatial = NDimSpatial;
using InLayout = InLayout_;
using WeiLayout = WeiLayout_;
using OutLayout = OutLayout_;
using InDataType = InDataType_;
using WeiDataType = WeiDataType_;
using OutDataType = OutDataType_;
using AccDataType = AccDataType_;
using InElementwiseOperation = InElementwiseOperation_;
using WeiElementwiseOperation = WeiElementwiseOperation_;
using OutElementwiseOperation = OutElementwiseOperation_;
static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
static constexpr ck::index_t kBlockSize = BlockSize;
static constexpr ck::index_t kMPerBlock = MPerBlock;
static constexpr ck::index_t kNPerBlock = NPerBlock;
static constexpr ck::index_t kK0PerBlock = K0PerBlock;
static constexpr ck::index_t kK1 = K1;
static constexpr ck::index_t kMPerXDL = MPerXDL;
static constexpr ck::index_t kNPerXDL = NPerXDL;
static constexpr ck::index_t kMXdlPerWave = MXdlPerWave;
static constexpr ck::index_t kNXdlPerWave = NXdlPerWave;
using ABlockTransferThreadClusterLengths_K0_M_K1 = ABlockTransferThreadClusterLengths_K0_M_K1_;
using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_;
using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_;
static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
ABlockTransferSrcScalarPerVector;
static constexpr ck::index_t kABlockTransferDstScalarPerVector_K1 =
ABlockTransferDstScalarPerVector_K1;
static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
using BBlockTransferThreadClusterLengths_K0_N_K1 = BBlockTransferThreadClusterLengths_K0_N_K1_;
using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_;
using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_;
static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
BBlockTransferSrcScalarPerVector;
static constexpr ck::index_t kBBlockTransferDstScalarPerVector_K1 =
BBlockTransferDstScalarPerVector_K1;
static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
static constexpr ck::index_t kCShuffleMXdlPerWavePerShuffle = CShuffleMXdlPerWavePerShuffle;
static constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = CShuffleNXdlPerWavePerShuffle;
using CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
static constexpr ck::index_t kCBlockTransferScalarPerVector_NWaveNPerXdl =
CBlockTransferScalarPerVector_NWaveNPerXdl;
using ComputeTypeA = ComputeTypeA_;
using ComputeTypeB = ComputeTypeB_;
static constexpr ck::index_t kMaxTransposeTransferSrcScalarPerVector =
MaxTransposeTransferSrcScalarPerVector;
static constexpr ck::index_t kMaxTransposeTransferDstScalarPerVector =
MaxTransposeTransferDstScalarPerVector;
// Static member function to generate instance string
static std::string instance_string()
{
std::ostringstream oss;
// Kernel type name
oss << "DeviceGroupedConvBwdWeight_Xdl_CShuffle";
// Template parameters in exact order
oss << "<" << kNDimSpatial; // 1. NDimSpatial
oss << "," << detail::layout_name<InLayout>(); // 2. InLayout
oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
oss << "," << detail::type_name<InDataType>(); // 5. InDataType
oss << "," << detail::type_name<WeiDataType>(); // 6. WeiDataType
oss << "," << detail::type_name<OutDataType>(); // 7. OutDataType
oss << "," << detail::type_name<AccDataType>(); // 8. AccDataType
oss << ","
<< detail::elementwise_op_name<InElementwiseOperation>(); // 9. InElementwiseOperation
oss << ","
<< detail::elementwise_op_name<WeiElementwiseOperation>(); // 10.
// WeiElementwiseOperation
oss << ","
<< detail::elementwise_op_name<OutElementwiseOperation>(); // 11.
// OutElementwiseOperation
oss << ","
<< detail::conv_bwd_weight_spec_name(
kConvBackwardWeightSpecialization); // 12. ConvBackwardWeightSpecialization
oss << "," << kBlockSize; // 13. BlockSize
oss << "," << kMPerBlock; // 14. MPerBlock
oss << "," << kNPerBlock; // 15. NPerBlock
oss << "," << kK0PerBlock; // 16. K0PerBlock
oss << "," << kK1; // 17. K1
oss << "," << kMPerXDL; // 18. MPerXDL
oss << "," << kNPerXDL; // 19. NPerXDL
oss << "," << kMXdlPerWave; // 20. MXdlPerWave
oss << "," << kNXdlPerWave; // 21. NXdlPerWave
oss << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_K0_M_K1>(); // 22.
oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>(); // 23.
oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>(); // 24.
oss << "," << kABlockTransferSrcVectorDim; // 25.
oss << "," << kABlockTransferSrcScalarPerVector; // 26.
oss << "," << kABlockTransferDstScalarPerVector_K1; // 27.
oss << "," << (kABlockLdsAddExtraM ? "true" : "false"); // 28.
oss << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_K0_N_K1>(); // 29.
oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>(); // 30.
oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>(); // 31.
oss << "," << kBBlockTransferSrcVectorDim; // 32.
oss << "," << kBBlockTransferSrcScalarPerVector; // 33.
oss << "," << kBBlockTransferDstScalarPerVector_K1; // 34.
oss << "," << (kBBlockLdsAddExtraN ? "true" : "false"); // 35.
oss << "," << kCShuffleMXdlPerWavePerShuffle; // 36.
oss << "," << kCShuffleNXdlPerWavePerShuffle; // 37.
oss << ","
<< detail::sequence_name<
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 38.
oss << "," << kCBlockTransferScalarPerVector_NWaveNPerXdl; // 39.
oss << "," << detail::type_name<ComputeTypeA>(); // 40.
oss << "," << detail::type_name<ComputeTypeB>(); // 41.
oss << "," << kMaxTransposeTransferSrcScalarPerVector; // 42.
oss << "," << kMaxTransposeTransferDstScalarPerVector; // 43.
oss << ">";
return oss.str();
}
};
} // namespace reflect
} // namespace ck_tile

View File

@@ -1,5 +1,5 @@
// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
// Utility functions and helpers for instance_traits.hpp
// Contains helper functions to convert types, enums, and sequences to string representations.
@@ -21,6 +21,7 @@
#include <ck_tile/ops/common/tensor_layout.hpp>
#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
#include <ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp>
#include <ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp>
#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
namespace ck_tile::reflect::detail {
@@ -112,6 +113,20 @@ conv_fwd_spec_name(ck::tensor_operation::device::ConvolutionForwardSpecializatio
}
}
// Convert ConvolutionBackwardWeightSpecialization enum to string
constexpr std::string_view conv_bwd_weight_spec_name(
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization spec)
{
using enum ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization;
switch(spec)
{
case Default: return "Default";
case Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0";
case Filter1x1Pad0: return "Filter1x1Pad0";
case OddC: return "OddC";
}
}
// Convert GemmSpecialization enum to string
constexpr std::string_view gemm_spec_name(ck::tensor_operation::device::GemmSpecialization spec)
{

View File

@@ -20,6 +20,7 @@ endfunction()
add_ck_builder_test(test_ckb_conv_builder
test_conv_builder.cpp
test_fwd_instance_traits.cpp
test_bwd_weight_instance_traits.cpp
test_instance_traits_util.cpp)
add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp)
@@ -30,7 +31,8 @@ add_ck_builder_test(test_ckb_get_instance_string
test_get_instance_string_fwd_grp_conv.cpp
test_get_instance_string_fwd_grp_conv_large_tensor.cpp
test_get_instance_string_fwd_grp_conv_wmma.cpp
test_get_instance_string_fwd_grp_conv_dl.cpp)
test_get_instance_string_fwd_grp_conv_dl.cpp
test_get_instance_string_bwd_weight_grp_conv_xdl.cpp)
# Testing the fwd convolution builder requires kernel compilation.
# To enable parallel compilation, the individual tests are split into separate files.

View File

@@ -0,0 +1,112 @@
// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include <ck/ck.hpp>
#include <ck_tile/builder/reflect/instance_traits.hpp>
#include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp>
namespace {
TEST(InstanceTraits, BwdWeightXdlCShuffleInstanceStringReturnsCorrectFormat)
{
using DeviceInstance = ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
2, // NDimSpatial
ck::tensor_layout::convolution::GNHWC, // InLayout
ck::tensor_layout::convolution::GKYXC, // WeiLayout
ck::tensor_layout::convolution::GNHWK, // OutLayout
ck::half_t, // InDataType
ck::half_t, // WeiDataType
ck::half_t, // OutDataType
float, // AccDataType
ck::tensor_operation::element_wise::PassThrough, // InElementwiseOperation
ck::tensor_operation::element_wise::PassThrough, // WeiElementwiseOperation
ck::tensor_operation::element_wise::PassThrough, // OutElementwiseOperation
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::
Default, // ConvBackwardWeightSpecialization
256, // BlockSize
128, // MPerBlock
128, // NPerBlock
4, // K0PerBlock
8, // K1
32, // MPerXDL
32, // NPerXDL
2, // MXdlPerWave
2, // NXdlPerWave
ck::Sequence<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
ck::Sequence<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
ck::Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1
false, // ABlockLdsAddExtraM
ck::Sequence<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
ck::Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
ck::Sequence<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1
false, // BBlockLdsAddExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
ck::Sequence<1,
32,
1,
8>, // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8, // CBlockTransferScalarPerVector_NWaveNPerXdl
ck::half_t, // ComputeTypeA
ck::half_t, // ComputeTypeB
1, // MaxTransposeTransferSrcScalarPerVector
1>; // MaxTransposeTransferDstScalarPerVector
std::string instance_str = ck_tile::reflect::instance_string<DeviceInstance>();
std::string expected_str = "DeviceGroupedConvBwdWeight_Xdl_CShuffle"
"<2" // NDimSpatial
",GNHWC" // InLayout
",GKYXC" // WeiLayout
",GNHWK" // OutLayout
",fp16" // InDataType
",fp16" // WeiDataType
",fp16" // OutDataType
",fp32" // AccDataType
",PassThrough" // InElementwiseOperation
",PassThrough" // WeiElementwiseOperation
",PassThrough" // OutElementwiseOperation
",Default" // ConvBackwardWeightSpecialization
",256" // BlockSize
",128" // MPerBlock
",128" // NPerBlock
",4" // K0PerBlock
",8" // K1
",32" // MPerXDL
",32" // NPerXDL
",2" // MXdlPerWave
",2" // NXdlPerWave
",Seq(4,64,1)" // ABlockTransferThreadClusterLengths_K0_M_K1
",Seq(1,0,2)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",2" // ABlockTransferSrcVectorDim
",8" // ABlockTransferSrcScalarPerVector
",8" // ABlockTransferDstScalarPerVector_K1
",false" // ABlockLdsAddExtraM
",Seq(4,64,1)" // BBlockTransferThreadClusterLengths_K0_N_K1
",Seq(1,0,2)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",2" // BBlockTransferSrcVectorDim
",8" // BBlockTransferSrcScalarPerVector
",8" // BBlockTransferDstScalarPerVector_K1
",false" // BBlockLdsAddExtraN
",1" // CShuffleMXdlPerWavePerShuffle
",1" // CShuffleNXdlPerWavePerShuffle
",Seq(1,32,1,8)" // CBlockTransferClusterLengths
",8" // CBlockTransferScalarPerVector_NWaveNPerXdl
",fp16" // ComputeTypeA
",fp16" // ComputeTypeB
",1" // MaxTransposeTransferSrcScalarPerVector
",1>"; // MaxTransposeTransferDstScalarPerVector
EXPECT_EQ(instance_str, expected_str);
}
} // anonymous namespace

View File

@@ -0,0 +1,86 @@
// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include <ck_tile/builder/reflect/instance_traits.hpp>
#include <ck/tensor_operation/gpu/device/device_base.hpp>
#include <ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp>
// Test GetInstanceString through base class pointer for backward weight XDL variant
TEST(GetInstanceString, ReturnsStringForBwdWeightGrpConvXdlInstance)
{
// Use the template helper to get a working instance configuration
using InstanceTuple = ck::tensor_operation::device::instance::
device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<
2, // NDimSpatial
ck::tensor_operation::device::instance::GNHWC, // InLayout
ck::tensor_operation::device::instance::GKYXC, // WeiLayout
ck::tensor_operation::device::instance::GNHWK, // OutLayout
ck::tensor_operation::device::instance::
ConvBwdWeightDefault>; // ConvBwdWeightSpecialization
// Get the first instance from the tuple
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Define the base class type using the most general operator base
using BaseClass = ck::tensor_operation::device::BaseOperator;
// Create an instance of the derived class
DeviceInstance device_instance;
// Get a pointer to the base class
BaseClass* base_ptr = &device_instance;
// Call GetInstanceString through the base class pointer
std::string instance_str = base_ptr->GetInstanceString();
// Expected complete instance string based on the first instance from
// device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances
// This corresponds to the configuration with BlockSize=64, MPerBlock=64, NPerBlock=64, etc.
std::string expected_str = "DeviceGroupedConvBwdWeight_Xdl_CShuffle"
"<2" // NDimSpatial
",GNHWC" // InLayout
",GKYXC" // WeiLayout
",GNHWK" // OutLayout
",fp16" // InDataType
",fp16" // WeiDataType
",fp16" // OutDataType
",fp32" // AccDataType
",PassThrough" // InElementwiseOperation
",PassThrough" // WeiElementwiseOperation
",PassThrough" // OutElementwiseOperation
",Default" // ConvBackwardWeightSpecialization
",64" // BlockSize
",64" // MPerBlock
",64" // NPerBlock
",4" // K0PerBlock
",8" // K1
",32" // MPerXDL
",32" // NPerXDL
",2" // MXdlPerWave
",2" // NXdlPerWave
",Seq(1,4,8,2)" // ABlockTransferThreadClusterLengths_K0_M_K1
",Seq(0,3,1,2)" // ABlockTransferThreadClusterArrangeOrder
",Seq(0,2,1,3)" // ABlockTransferSrcAccessOrder
",2" // ABlockTransferSrcVectorDim
",2" // ABlockTransferSrcScalarPerVector
",4" // ABlockTransferDstScalarPerVector_K1
",true" // ABlockLdsAddExtraM
",Seq(1,4,8,2)" // BBlockTransferThreadClusterLengths_K0_N_K1
",Seq(0,3,1,2)" // BBlockTransferThreadClusterArrangeOrder
",Seq(0,2,1,3)" // BBlockTransferSrcAccessOrder
",2" // BBlockTransferSrcVectorDim
",2" // BBlockTransferSrcScalarPerVector
",4" // BBlockTransferDstScalarPerVector_K1
",true" // BBlockLdsAddExtraN
",1" // CShuffleMXdlPerWavePerShuffle
",1" // CShuffleNXdlPerWavePerShuffle
",Seq(1,16,1,4)" // CBlockTransferClusterLengths
",2" // CBlockTransferScalarPerVector_NWaveNPerXdl
",fp16" // ComputeTypeA
",fp16" // ComputeTypeB
",1" // MaxTransposeTransferSrcScalarPerVector
",1>"; // MaxTransposeTransferDstScalarPerVector
EXPECT_EQ(instance_str, expected_str);
}

View File

@@ -1,5 +1,5 @@
// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -24,6 +24,10 @@
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#ifdef CK_EXPERIMENTAL_BUILDER
#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"
#endif
namespace ck {
namespace tensor_operation {
namespace device {
@@ -1225,6 +1229,19 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
return str.str();
}
#ifdef CK_EXPERIMENTAL_BUILDER
std::string GetInstanceString() const override
{
static_assert(ck_tile::reflect::HasInstanceTraits<DeviceOp>,
"Specialization of instance_traits not found. Please check that a "
"specialization exists in file "
"ck_tile/builder/reflect/"
"instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp "
"for the given template parameters.");
return ck_tile::reflect::instance_string<DeviceOp>();
}
#endif
size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
{
auto arg = dynamic_cast<const Argument*>(p_arg);