From 52204ff4e5a3ee337de1ebb9d7edbd7a2dea2b34 Mon Sep 17 00:00:00 2001 From: John Shumway Date: Tue, 4 Nov 2025 06:34:00 -0800 Subject: [PATCH] [CK_BUILDER] Add backward weight instance traits for xdl cshuffle. (#3143) * Add backward weight instance traits for xdl cshuffle. To keep instance test file sizes reasonable, we start a new test_bwd_weight_instances_traits.cpp test file. * Fix copyright notices. * Remove (c) symbol, replace with (C). Having UTF-8 in source caused an error with code generation. [ROCm/composable_kernel commit: 6dbee64886c84e9777a9c49bfc46d8947f8009a3] --- ...e_grouped_conv_bwd_weight_xdl_cshuffle.hpp | 285 ++++++++++++++++++ .../builder/reflect/instance_traits_util.hpp | 17 +- experimental/builder/test/CMakeLists.txt | 4 +- .../test/test_bwd_weight_instance_traits.cpp | 112 +++++++ ...nstance_string_bwd_weight_grp_conv_xdl.cpp | 86 ++++++ ...e_grouped_conv_bwd_weight_xdl_cshuffle.hpp | 19 +- 6 files changed, 520 insertions(+), 3 deletions(-) create mode 100644 experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp create mode 100644 experimental/builder/test/test_bwd_weight_instance_traits.cpp create mode 100644 experimental/builder/test/test_get_instance_string_bwd_weight_grp_conv_xdl.cpp diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp new file mode 100644 index 0000000000..a0def3e5d9 --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp @@ -0,0 +1,285 @@ +// Copyright (C) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "instance_traits.hpp" +#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp" + +// Forward declaration to avoid circular dependency +namespace ck::tensor_operation::device { + +template +struct DeviceGroupedConvBwdWeight_Xdl_CShuffle; + +} // namespace ck::tensor_operation::device + +namespace ck_tile { +namespace reflect { + +template +struct InstanceTraits> +{ + static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeight_Xdl_CShuffle"; + + static constexpr ck::index_t kNDimSpatial = NDimSpatial; + + using InLayout = InLayout_; + using WeiLayout = WeiLayout_; + using OutLayout = OutLayout_; + + using InDataType = InDataType_; + using WeiDataType = WeiDataType_; + using OutDataType = OutDataType_; + using AccDataType = AccDataType_; + + using InElementwiseOperation = InElementwiseOperation_; + using WeiElementwiseOperation = WeiElementwiseOperation_; + using OutElementwiseOperation = OutElementwiseOperation_; + + static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization; + + static constexpr ck::index_t kBlockSize = BlockSize; + static constexpr ck::index_t kMPerBlock = MPerBlock; + static constexpr ck::index_t kNPerBlock = NPerBlock; + static constexpr ck::index_t kK0PerBlock = K0PerBlock; + static constexpr ck::index_t kK1 = K1; + static constexpr ck::index_t kMPerXDL = MPerXDL; + static constexpr ck::index_t kNPerXDL = NPerXDL; + static constexpr ck::index_t kMXdlPerWave = MXdlPerWave; + static constexpr ck::index_t kNXdlPerWave = NXdlPerWave; + + using ABlockTransferThreadClusterLengths_K0_M_K1 = ABlockTransferThreadClusterLengths_K0_M_K1_; + using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_; + using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_; + static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim; + static constexpr ck::index_t kABlockTransferSrcScalarPerVector = + ABlockTransferSrcScalarPerVector; + static constexpr ck::index_t kABlockTransferDstScalarPerVector_K1 = + ABlockTransferDstScalarPerVector_K1; + static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM; + + using BBlockTransferThreadClusterLengths_K0_N_K1 = BBlockTransferThreadClusterLengths_K0_N_K1_; + using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_; + using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_; + static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim; + static constexpr ck::index_t kBBlockTransferSrcScalarPerVector = + BBlockTransferSrcScalarPerVector; + static constexpr ck::index_t kBBlockTransferDstScalarPerVector_K1 = + BBlockTransferDstScalarPerVector_K1; + static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN; + + static constexpr ck::index_t kCShuffleMXdlPerWavePerShuffle = CShuffleMXdlPerWavePerShuffle; + static constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = CShuffleNXdlPerWavePerShuffle; + + using CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = + CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_; + static constexpr ck::index_t kCBlockTransferScalarPerVector_NWaveNPerXdl = + CBlockTransferScalarPerVector_NWaveNPerXdl; + + using ComputeTypeA = ComputeTypeA_; + using ComputeTypeB = ComputeTypeB_; + + static constexpr ck::index_t kMaxTransposeTransferSrcScalarPerVector = + MaxTransposeTransferSrcScalarPerVector; + static constexpr ck::index_t kMaxTransposeTransferDstScalarPerVector = + MaxTransposeTransferDstScalarPerVector; + + // Static member function to generate instance string + static std::string instance_string() + { + std::ostringstream oss; + + // Kernel type name + oss << "DeviceGroupedConvBwdWeight_Xdl_CShuffle"; + + // Template parameters in exact order + oss << "<" << kNDimSpatial; // 1. NDimSpatial + oss << "," << detail::layout_name(); // 2. InLayout + oss << "," << detail::layout_name(); // 3. WeiLayout + oss << "," << detail::layout_name(); // 4. OutLayout + oss << "," << detail::type_name(); // 5. InDataType + oss << "," << detail::type_name(); // 6. WeiDataType + oss << "," << detail::type_name(); // 7. OutDataType + oss << "," << detail::type_name(); // 8. AccDataType + oss << "," + << detail::elementwise_op_name(); // 9. InElementwiseOperation + oss << "," + << detail::elementwise_op_name(); // 10. + // WeiElementwiseOperation + oss << "," + << detail::elementwise_op_name(); // 11. + // OutElementwiseOperation + oss << "," + << detail::conv_bwd_weight_spec_name( + kConvBackwardWeightSpecialization); // 12. ConvBackwardWeightSpecialization + oss << "," << kBlockSize; // 13. BlockSize + oss << "," << kMPerBlock; // 14. MPerBlock + oss << "," << kNPerBlock; // 15. NPerBlock + oss << "," << kK0PerBlock; // 16. K0PerBlock + oss << "," << kK1; // 17. K1 + oss << "," << kMPerXDL; // 18. MPerXDL + oss << "," << kNPerXDL; // 19. NPerXDL + oss << "," << kMXdlPerWave; // 20. MXdlPerWave + oss << "," << kNXdlPerWave; // 21. NXdlPerWave + oss << "," << detail::sequence_name(); // 22. + oss << "," << detail::sequence_name(); // 23. + oss << "," << detail::sequence_name(); // 24. + oss << "," << kABlockTransferSrcVectorDim; // 25. + oss << "," << kABlockTransferSrcScalarPerVector; // 26. + oss << "," << kABlockTransferDstScalarPerVector_K1; // 27. + oss << "," << (kABlockLdsAddExtraM ? "true" : "false"); // 28. + oss << "," << detail::sequence_name(); // 29. + oss << "," << detail::sequence_name(); // 30. + oss << "," << detail::sequence_name(); // 31. + oss << "," << kBBlockTransferSrcVectorDim; // 32. + oss << "," << kBBlockTransferSrcScalarPerVector; // 33. + oss << "," << kBBlockTransferDstScalarPerVector_K1; // 34. + oss << "," << (kBBlockLdsAddExtraN ? "true" : "false"); // 35. + oss << "," << kCShuffleMXdlPerWavePerShuffle; // 36. + oss << "," << kCShuffleNXdlPerWavePerShuffle; // 37. + oss << "," + << detail::sequence_name< + CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 38. + oss << "," << kCBlockTransferScalarPerVector_NWaveNPerXdl; // 39. + oss << "," << detail::type_name(); // 40. + oss << "," << detail::type_name(); // 41. + oss << "," << kMaxTransposeTransferSrcScalarPerVector; // 42. + oss << "," << kMaxTransposeTransferDstScalarPerVector; // 43. + oss << ">"; + + return oss.str(); + } +}; + +} // namespace reflect +} // namespace ck_tile diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp index b13675a7b9..95d1c94de4 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp @@ -1,5 +1,5 @@ +// Copyright (C) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. // Utility functions and helpers for instance_traits.hpp // Contains helper functions to convert types, enums, and sequences to string representations. @@ -21,6 +21,7 @@ #include #include #include +#include #include namespace ck_tile::reflect::detail { @@ -112,6 +113,20 @@ conv_fwd_spec_name(ck::tensor_operation::device::ConvolutionForwardSpecializatio } } +// Convert ConvolutionBackwardWeightSpecialization enum to string +constexpr std::string_view conv_bwd_weight_spec_name( + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization spec) +{ + using enum ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization; + switch(spec) + { + case Default: return "Default"; + case Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0"; + case Filter1x1Pad0: return "Filter1x1Pad0"; + case OddC: return "OddC"; + } +} + // Convert GemmSpecialization enum to string constexpr std::string_view gemm_spec_name(ck::tensor_operation::device::GemmSpecialization spec) { diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index 26a666a805..8b5c4519a9 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -20,6 +20,7 @@ endfunction() add_ck_builder_test(test_ckb_conv_builder test_conv_builder.cpp test_fwd_instance_traits.cpp + test_bwd_weight_instance_traits.cpp test_instance_traits_util.cpp) add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp) @@ -30,7 +31,8 @@ add_ck_builder_test(test_ckb_get_instance_string test_get_instance_string_fwd_grp_conv.cpp test_get_instance_string_fwd_grp_conv_large_tensor.cpp test_get_instance_string_fwd_grp_conv_wmma.cpp - test_get_instance_string_fwd_grp_conv_dl.cpp) + test_get_instance_string_fwd_grp_conv_dl.cpp + test_get_instance_string_bwd_weight_grp_conv_xdl.cpp) # Testing the fwd convolution builder requires kernel compilation. # To enable parallel compilation, the individual tests are split into separate files. diff --git a/experimental/builder/test/test_bwd_weight_instance_traits.cpp b/experimental/builder/test/test_bwd_weight_instance_traits.cpp new file mode 100644 index 0000000000..24c28c2b9d --- /dev/null +++ b/experimental/builder/test/test_bwd_weight_instance_traits.cpp @@ -0,0 +1,112 @@ +// Copyright (C) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include + +namespace { + +TEST(InstanceTraits, BwdWeightXdlCShuffleInstanceStringReturnsCorrectFormat) +{ + using DeviceInstance = ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle< + 2, // NDimSpatial + ck::tensor_layout::convolution::GNHWC, // InLayout + ck::tensor_layout::convolution::GKYXC, // WeiLayout + ck::tensor_layout::convolution::GNHWK, // OutLayout + ck::half_t, // InDataType + ck::half_t, // WeiDataType + ck::half_t, // OutDataType + float, // AccDataType + ck::tensor_operation::element_wise::PassThrough, // InElementwiseOperation + ck::tensor_operation::element_wise::PassThrough, // WeiElementwiseOperation + ck::tensor_operation::element_wise::PassThrough, // OutElementwiseOperation + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization:: + Default, // ConvBackwardWeightSpecialization + 256, // BlockSize + 128, // MPerBlock + 128, // NPerBlock + 4, // K0PerBlock + 8, // K1 + 32, // MPerXDL + 32, // NPerXDL + 2, // MXdlPerWave + 2, // NXdlPerWave + ck::Sequence<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 + ck::Sequence<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + ck::Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_K1 + false, // ABlockLdsAddExtraM + ck::Sequence<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 + ck::Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + ck::Sequence<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_K1 + false, // BBlockLdsAddExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + ck::Sequence<1, + 32, + 1, + 8>, // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CBlockTransferScalarPerVector_NWaveNPerXdl + ck::half_t, // ComputeTypeA + ck::half_t, // ComputeTypeB + 1, // MaxTransposeTransferSrcScalarPerVector + 1>; // MaxTransposeTransferDstScalarPerVector + + std::string instance_str = ck_tile::reflect::instance_string(); + + std::string expected_str = "DeviceGroupedConvBwdWeight_Xdl_CShuffle" + "<2" // NDimSpatial + ",GNHWC" // InLayout + ",GKYXC" // WeiLayout + ",GNHWK" // OutLayout + ",fp16" // InDataType + ",fp16" // WeiDataType + ",fp16" // OutDataType + ",fp32" // AccDataType + ",PassThrough" // InElementwiseOperation + ",PassThrough" // WeiElementwiseOperation + ",PassThrough" // OutElementwiseOperation + ",Default" // ConvBackwardWeightSpecialization + ",256" // BlockSize + ",128" // MPerBlock + ",128" // NPerBlock + ",4" // K0PerBlock + ",8" // K1 + ",32" // MPerXDL + ",32" // NPerXDL + ",2" // MXdlPerWave + ",2" // NXdlPerWave + ",Seq(4,64,1)" // ABlockTransferThreadClusterLengths_K0_M_K1 + ",Seq(1,0,2)" // ABlockTransferThreadClusterArrangeOrder + ",Seq(1,0,2)" // ABlockTransferSrcAccessOrder + ",2" // ABlockTransferSrcVectorDim + ",8" // ABlockTransferSrcScalarPerVector + ",8" // ABlockTransferDstScalarPerVector_K1 + ",false" // ABlockLdsAddExtraM + ",Seq(4,64,1)" // BBlockTransferThreadClusterLengths_K0_N_K1 + ",Seq(1,0,2)" // BBlockTransferThreadClusterArrangeOrder + ",Seq(1,0,2)" // BBlockTransferSrcAccessOrder + ",2" // BBlockTransferSrcVectorDim + ",8" // BBlockTransferSrcScalarPerVector + ",8" // BBlockTransferDstScalarPerVector_K1 + ",false" // BBlockLdsAddExtraN + ",1" // CShuffleMXdlPerWavePerShuffle + ",1" // CShuffleNXdlPerWavePerShuffle + ",Seq(1,32,1,8)" // CBlockTransferClusterLengths + ",8" // CBlockTransferScalarPerVector_NWaveNPerXdl + ",fp16" // ComputeTypeA + ",fp16" // ComputeTypeB + ",1" // MaxTransposeTransferSrcScalarPerVector + ",1>"; // MaxTransposeTransferDstScalarPerVector + + EXPECT_EQ(instance_str, expected_str); +} + +} // anonymous namespace diff --git a/experimental/builder/test/test_get_instance_string_bwd_weight_grp_conv_xdl.cpp b/experimental/builder/test/test_get_instance_string_bwd_weight_grp_conv_xdl.cpp new file mode 100644 index 0000000000..68b43c6a99 --- /dev/null +++ b/experimental/builder/test/test_get_instance_string_bwd_weight_grp_conv_xdl.cpp @@ -0,0 +1,86 @@ +// Copyright (C) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include + +// Test GetInstanceString through base class pointer for backward weight XDL variant +TEST(GetInstanceString, ReturnsStringForBwdWeightGrpConvXdlInstance) +{ + // Use the template helper to get a working instance configuration + using InstanceTuple = ck::tensor_operation::device::instance:: + device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances< + 2, // NDimSpatial + ck::tensor_operation::device::instance::GNHWC, // InLayout + ck::tensor_operation::device::instance::GKYXC, // WeiLayout + ck::tensor_operation::device::instance::GNHWK, // OutLayout + ck::tensor_operation::device::instance:: + ConvBwdWeightDefault>; // ConvBwdWeightSpecialization + + // Get the first instance from the tuple + using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type; + + // Define the base class type using the most general operator base + using BaseClass = ck::tensor_operation::device::BaseOperator; + + // Create an instance of the derived class + DeviceInstance device_instance; + + // Get a pointer to the base class + BaseClass* base_ptr = &device_instance; + + // Call GetInstanceString through the base class pointer + std::string instance_str = base_ptr->GetInstanceString(); + + // Expected complete instance string based on the first instance from + // device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances + // This corresponds to the configuration with BlockSize=64, MPerBlock=64, NPerBlock=64, etc. + std::string expected_str = "DeviceGroupedConvBwdWeight_Xdl_CShuffle" + "<2" // NDimSpatial + ",GNHWC" // InLayout + ",GKYXC" // WeiLayout + ",GNHWK" // OutLayout + ",fp16" // InDataType + ",fp16" // WeiDataType + ",fp16" // OutDataType + ",fp32" // AccDataType + ",PassThrough" // InElementwiseOperation + ",PassThrough" // WeiElementwiseOperation + ",PassThrough" // OutElementwiseOperation + ",Default" // ConvBackwardWeightSpecialization + ",64" // BlockSize + ",64" // MPerBlock + ",64" // NPerBlock + ",4" // K0PerBlock + ",8" // K1 + ",32" // MPerXDL + ",32" // NPerXDL + ",2" // MXdlPerWave + ",2" // NXdlPerWave + ",Seq(1,4,8,2)" // ABlockTransferThreadClusterLengths_K0_M_K1 + ",Seq(0,3,1,2)" // ABlockTransferThreadClusterArrangeOrder + ",Seq(0,2,1,3)" // ABlockTransferSrcAccessOrder + ",2" // ABlockTransferSrcVectorDim + ",2" // ABlockTransferSrcScalarPerVector + ",4" // ABlockTransferDstScalarPerVector_K1 + ",true" // ABlockLdsAddExtraM + ",Seq(1,4,8,2)" // BBlockTransferThreadClusterLengths_K0_N_K1 + ",Seq(0,3,1,2)" // BBlockTransferThreadClusterArrangeOrder + ",Seq(0,2,1,3)" // BBlockTransferSrcAccessOrder + ",2" // BBlockTransferSrcVectorDim + ",2" // BBlockTransferSrcScalarPerVector + ",4" // BBlockTransferDstScalarPerVector_K1 + ",true" // BBlockLdsAddExtraN + ",1" // CShuffleMXdlPerWavePerShuffle + ",1" // CShuffleNXdlPerWavePerShuffle + ",Seq(1,16,1,4)" // CBlockTransferClusterLengths + ",2" // CBlockTransferScalarPerVector_NWaveNPerXdl + ",fp16" // ComputeTypeA + ",fp16" // ComputeTypeB + ",1" // MaxTransposeTransferSrcScalarPerVector + ",1>"; // MaxTransposeTransferDstScalarPerVector + + EXPECT_EQ(instance_str, expected_str); +} diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp index c7ee3e9ecf..650c6f11d3 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp @@ -1,5 +1,5 @@ +// Copyright (C) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -24,6 +24,10 @@ #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" +#ifdef CK_EXPERIMENTAL_BUILDER +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp" +#endif + namespace ck { namespace tensor_operation { namespace device { @@ -1225,6 +1229,19 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle return str.str(); } +#ifdef CK_EXPERIMENTAL_BUILDER + std::string GetInstanceString() const override + { + static_assert(ck_tile::reflect::HasInstanceTraits, + "Specialization of instance_traits not found. Please check that a " + "specialization exists in file " + "ck_tile/builder/reflect/" + "instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp " + "for the given template parameters."); + return ck_tile::reflect::instance_string(); + } +#endif + size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override { auto arg = dynamic_cast(p_arg);