diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp new file mode 100644 index 0000000000..54ba224f7f --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,341 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +// InstanceTraits specialization for DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK +// +// CRITICAL MAINTENANCE NOTE: +// This InstanceTraits file MUST be kept strictly in sync with the device implementation header: +// ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp +// "In sync" means that the template parameter order, names, and types in the declaration below +// MUST EXACTLY MATCH those in the device implementation. If these diverge, you may encounter +// compilation errors, subtle template instantiation mismatches, or silent runtime bugs that are +// difficult to diagnose. Always update both files together and review changes carefully. + +#pragma once + +#include "instance_traits.hpp" + +// Forward declaration to avoid circular dependency. +// This file will be included by the device implementation header, so we cannot include +// the implementation header here. We only need the template signature to pattern-match +// on template parameters - we don't need any implementation details. + +namespace ck::tensor_operation::device { + +template +struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK; + +} // namespace ck::tensor_operation::device + +namespace ck_tile::reflect { + +// Specialization for DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK +template +struct InstanceTraits> +{ + // Spatial dimension + static constexpr int kSpatialDim = NDimSpatial; + + // Data types + using ADataType = ADataType_; + using BDataType = BDataType_; + using DsDataType = DsDataType_; + using EDataType = EDataType_; + using AccDataType = AccDataType_; + + // Layout types + using ALayout = ALayout_; + using BLayout = BLayout_; + using DsLayout = DsLayout_; + using ELayout = ELayout_; + + // Element-wise operations + using AElementwiseOperation = AElementwiseOperation_; + using BElementwiseOperation = BElementwiseOperation_; + using CDEElementwiseOperation = CDEElementwiseOperation_; + + // Specialization + static constexpr ck::tensor_operation::device::ConvolutionForwardSpecialization + kConvForwardSpecialization = ConvForwardSpecialization; + static constexpr ck::tensor_operation::device::GemmSpecialization kGemmSpecialization = + GemmSpec; + + // Block configuration + static constexpr int kBlockSize = BlockSize; + static constexpr int kMPerBlock = MPerBlock; + static constexpr int kNPerBlock = NPerBlock; + static constexpr int kK0PerBlock = K0PerBlock; + + // Tuning parameters + static constexpr int kK1 = K1; + static constexpr int kM1PerThread = M1PerThread; + static constexpr int kN1PerThread = N1PerThread; + static constexpr int kKPerThread = KPerThread; + + // Thread cluster configurations + using M1N1ThreadClusterM1Xs = M1N1ThreadClusterM1Xs_; + using M1N1ThreadClusterN1Xs = M1N1ThreadClusterN1Xs_; + + // A block transfer parameters + using ABlockTransferThreadSliceLengths_K0_M0_M1_K1 = + ABlockTransferThreadSliceLengths_K0_M0_M1_K1_; + using ABlockTransferThreadClusterLengths_K0_M0_M1_K1 = + ABlockTransferThreadClusterLengths_K0_M0_M1_K1_; + using ABlockTransferThreadClusterArrangeOrder = ABlockTransferThreadClusterArrangeOrder_; + using ABlockTransferSrcAccessOrder = ABlockTransferSrcAccessOrder_; + using ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 = + ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1_; + using ABlockTransferSrcVectorTensorContiguousDimOrder = + ABlockTransferSrcVectorTensorContiguousDimOrder_; + using ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 = + ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1_; + + // B block transfer parameters + using BBlockTransferThreadSliceLengths_K0_N0_N1_K1 = + BBlockTransferThreadSliceLengths_K0_N0_N1_K1_; + using BBlockTransferThreadClusterLengths_K0_N0_N1_K1 = + BBlockTransferThreadClusterLengths_K0_N0_N1_K1_; + using BBlockTransferThreadClusterArrangeOrder = BBlockTransferThreadClusterArrangeOrder_; + using BBlockTransferSrcAccessOrder = BBlockTransferSrcAccessOrder_; + using BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 = + BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1_; + using BBlockTransferSrcVectorTensorContiguousDimOrder = + BBlockTransferSrcVectorTensorContiguousDimOrder_; + using BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 = + BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1_; + + // C thread transfer parameters + using CThreadTransferSrcDstAccessOrder = CThreadTransferSrcDstAccessOrder_; + static constexpr int kCThreadTransferSrcDstVectorDim = CThreadTransferSrcDstVectorDim; + static constexpr int kCThreadTransferDstScalarPerVector = CThreadTransferDstScalarPerVector; + + // Static member function to generate instance string + static std::string instance_string() + { + std::ostringstream oss; + + // Kernel type name + oss << "DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK"; + + // Template parameters in exact order matching the device implementation + oss << "<" << kSpatialDim; // 1. NDimSpatial + oss << "," << detail::type_name(); // 2. ADataType + oss << "," << detail::type_name(); // 3. BDataType + oss << "," << detail::tuple_name(); // 4. DsDataType + oss << "," << detail::type_name(); // 5. EDataType + oss << "," << detail::type_name(); // 6. AccDataType + oss << "," << detail::layout_name(); // 7. ALayout + oss << "," << detail::layout_name(); // 8. BLayout + oss << "," << detail::tuple_name(); // 9. DsLayout + oss << "," << detail::layout_name(); // 10. ELayout + oss << "," + << detail::elementwise_op_name(); // 11. AElementwiseOperation + oss << "," + << detail::elementwise_op_name(); // 12. BElementwiseOperation + oss << "," + << detail::elementwise_op_name(); // 13. + // CDEElementwiseOperation + oss << "," + << detail::conv_fwd_spec_name( + kConvForwardSpecialization); // 14. ConvForwardSpecialization + oss << "," << detail::gemm_spec_name(kGemmSpecialization); // 15. GemmSpec + oss << "," << kBlockSize; // 16. BlockSize + oss << "," << kMPerBlock; // 17. MPerBlock + oss << "," << kNPerBlock; // 18. NPerBlock + oss << "," << kK0PerBlock; // 19. K0PerBlock + oss << "," << kK1; // 20. K1 + oss << "," << kM1PerThread; // 21. M1PerThread + oss << "," << kN1PerThread; // 22. N1PerThread + oss << "," << kKPerThread; // 23. KPerThread + oss << "," << detail::sequence_name(); // 24. M1N1ThreadClusterM1Xs + oss << "," << detail::sequence_name(); // 25. M1N1ThreadClusterN1Xs + oss << "," + << detail::sequence_name< + ABlockTransferThreadSliceLengths_K0_M0_M1_K1>(); // 26. + // ABlockTransferThreadSliceLengths_K0_M0_M1_K1 + oss << "," + << detail::sequence_name< + ABlockTransferThreadClusterLengths_K0_M0_M1_K1>(); // 27. + // ABlockTransferThreadClusterLengths_K0_M0_M1_K1 + oss << "," + << detail::sequence_name< + ABlockTransferThreadClusterArrangeOrder>(); // 28. + // ABlockTransferThreadClusterArrangeOrder + oss << "," + << detail::sequence_name< + ABlockTransferSrcAccessOrder>(); // 29. ABlockTransferSrcAccessOrder + oss << "," + << detail::sequence_name< + ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1>(); // 30. + // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 + oss << "," + << detail::sequence_name< + ABlockTransferSrcVectorTensorContiguousDimOrder>(); // 31. + // ABlockTransferSrcVectorTensorContiguousDimOrder + oss << "," + << detail::sequence_name< + ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1>(); // 32. + // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 + oss << "," + << detail::sequence_name< + BBlockTransferThreadSliceLengths_K0_N0_N1_K1>(); // 33. + // BBlockTransferThreadSliceLengths_K0_N0_N1_K1 + oss << "," + << detail::sequence_name< + BBlockTransferThreadClusterLengths_K0_N0_N1_K1>(); // 34. + // BBlockTransferThreadClusterLengths_K0_N0_N1_K1 + oss << "," + << detail::sequence_name< + BBlockTransferThreadClusterArrangeOrder>(); // 35. + // BBlockTransferThreadClusterArrangeOrder + oss << "," + << detail::sequence_name< + BBlockTransferSrcAccessOrder>(); // 36. BBlockTransferSrcAccessOrder + oss << "," + << detail::sequence_name< + BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1>(); // 37. + // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 + oss << "," + << detail::sequence_name< + BBlockTransferSrcVectorTensorContiguousDimOrder>(); // 38. + // BBlockTransferSrcVectorTensorContiguousDimOrder + oss << "," + << detail::sequence_name< + BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1>(); // 39. + // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 + oss << "," + << detail::sequence_name< + CThreadTransferSrcDstAccessOrder>(); // 40. CThreadTransferSrcDstAccessOrder + oss << "," << kCThreadTransferSrcDstVectorDim; // 41. CThreadTransferSrcDstVectorDim + oss << "," << kCThreadTransferDstScalarPerVector; // 42. CThreadTransferDstScalarPerVector + oss << ">"; + + return oss.str(); + } +}; + +} // namespace ck_tile::reflect diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index 462269884e..346b731c33 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -32,8 +32,8 @@ template struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle; @@ -269,17 +269,17 @@ struct InstanceTraits(); // 2. ALayout - oss << "," << detail::layout_name(); // 3. BLayout - oss << "," << detail::tuple_name(); // 4. DsLayout - oss << "," << detail::layout_name(); // 5. ELayout - oss << "," << detail::type_name(); // 6. ADataType - oss << "," << detail::type_name(); // 7. BDataType - oss << "," << detail::type_name(); // 8. AccDataType - oss << "," << detail::type_name(); // 9. CShuffleDataType - oss << "," << detail::tuple_name(); // 10. DsDataType - oss << "," << detail::type_name(); // 11. EDataType + oss << "<" << kSpatialDim; // 1. NDimSpatial + oss << "," << detail::layout_name(); // 2. ALayout + oss << "," << detail::layout_name(); // 3. BLayout + oss << "," << detail::tuple_name(); // 4. DsLayout + oss << "," << detail::layout_name(); // 5. ELayout + oss << "," << detail::type_or_type_tuple_name(); // 6. ADataType + oss << "," << detail::type_or_type_tuple_name(); // 7. BDataType + oss << "," << detail::type_name(); // 8. AccDataType + oss << "," << detail::type_name(); // 9. CShuffleDataType + oss << "," << detail::tuple_name(); // 10. DsDataType + oss << "," << detail::type_name(); // 11. EDataType oss << "," << detail::elementwise_op_name(); // 12. AElementwiseOperation oss << "," diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp index 9ab827e3a5..5784938fc6 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp @@ -22,7 +22,7 @@ // on template parameters - we don't need any implementation details. namespace ck::tensor_operation::device { -template +struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle; + +} // namespace ck::tensor_operation::device + +namespace ck_tile::reflect { + +// Specialization for DeviceGroupedConvFwdMultipleD_Wmma_CShuffle +template +struct InstanceTraits> +{ + // Spatial dimension + static constexpr int kSpatialDim = NDimSpatial; + + // Layout types + using ALayout = ALayout_; + using BLayout = BLayout_; + using DsLayout = DsLayout_; + using ELayout = ELayout_; + + // Data types + using ADataType = ADataType_; + using BDataType = BDataType_; + using AccDataType = AccDataType_; + using CShuffleDataType = CShuffleDataType_; + using DsDataType = DsDataType_; + using EDataType = EDataType_; + + // Element-wise operations + using AElementwiseOperation = AElementwiseOperation_; + using BElementwiseOperation = BElementwiseOperation_; + using CDEElementwiseOperation = CDEElementwiseOperation_; + + // Specialization + static constexpr ck::tensor_operation::device::ConvolutionForwardSpecialization + kConvForwardSpecialization = ConvForwardSpecialization; + static constexpr ck::tensor_operation::device::GemmSpecialization kGemmSpecialization = + GemmSpec; + + // Prefetch stage + static constexpr int kNumGemmKPrefetchStage = NumGemmKPrefetchStage; + + // Block configuration + static constexpr int kBlockSize = BlockSize; + static constexpr int kMPerBlock = MPerBlock; + static constexpr int kNPerBlock = NPerBlock; + static constexpr int kKPerBlock = KPerBlock; + + // Tuning parameters + static constexpr int kK1 = K1; + static constexpr int kMPerWmma = MPerWmma; + static constexpr int kNPerWmma = NPerWmma; + static constexpr int kMRepeat = MRepeat; + static constexpr int kNRepeat = NRepeat; + + // A block transfer thread cluster dimensions (converted to std::array) + static constexpr auto kAThreadClusterLengths = + detail::SequenceToArray::value; + static constexpr auto kAThreadClusterArrangeOrder = + detail::SequenceToArray::value; + static constexpr auto kABlockTransferSrcAccessOrder = + detail::SequenceToArray::value; + static constexpr int kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim; + static constexpr int kABlockTransferSrcScalarPerVector = ABlockTransferSrcScalarPerVector; + static constexpr int kABlockTransferDstScalarPerVectorK1 = ABlockTransferDstScalarPerVector_AK1; + static constexpr bool kABlockLdsExtraM = ABlockLdsExtraM; + + // B block transfer thread cluster dimensions (converted to std::array) + static constexpr auto kBThreadClusterLengths = + detail::SequenceToArray::value; + static constexpr auto kBThreadClusterArrangeOrder = + detail::SequenceToArray::value; + static constexpr auto kBBlockTransferSrcAccessOrder = + detail::SequenceToArray::value; + static constexpr int kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim; + static constexpr int kBBlockTransferSrcScalarPerVector = BBlockTransferSrcScalarPerVector; + static constexpr int kBBlockTransferDstScalarPerVectorK1 = BBlockTransferDstScalarPerVector_BK1; + static constexpr bool kBBlockLdsExtraN = BBlockLdsExtraN; + + // C shuffle parameters (converted to std::array) + static constexpr int kCShuffleMRepeatPerShuffle = CShuffleMRepeatPerShuffle; + static constexpr int kCShuffleNRepeatPerShuffle = CShuffleNRepeatPerShuffle; + static constexpr auto kCDEThreadClusterLengths = detail::SequenceToArray< + CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>::value; + static constexpr int kCDEBlockTransferScalarPerVector = + CDEShuffleBlockTransferScalarPerVector_NPerBlock; + + // Pipeline configuration + static constexpr ck::LoopScheduler kLoopScheduler = LoopSched; + static constexpr ck::PipelineVersion kPipelineVersion = PipelineVer; + + // Static member function to generate instance string + static std::string instance_string() + { + std::ostringstream oss; + + // Kernel type name + oss << "DeviceGroupedConvFwdMultipleD_Wmma_CShuffle"; + + // Template parameters in exact order matching InstanceTraits member order + oss << "<" << kSpatialDim; // 1. NDimSpatial + oss << "," << detail::layout_name(); // 2. ALayout + oss << "," << detail::layout_name(); // 3. BLayout + oss << "," << detail::tuple_name(); // 4. DsLayout + oss << "," << detail::layout_name(); // 5. ELayout + oss << "," << detail::type_name(); // 6. ADataType + oss << "," << detail::type_name(); // 7. BDataType + oss << "," << detail::type_name(); // 8. AccDataType + oss << "," << detail::type_name(); // 9. CShuffleDataType + oss << "," << detail::tuple_name(); // 10. DsDataType + oss << "," << detail::type_name(); // 11. EDataType + oss << "," + << detail::elementwise_op_name(); // 12. AElementwiseOperation + oss << "," + << detail::elementwise_op_name(); // 13. BElementwiseOperation + oss << "," + << detail::elementwise_op_name(); // 14. + // CDEElementwiseOperation + oss << "," + << detail::conv_fwd_spec_name( + kConvForwardSpecialization); // 15. ConvForwardSpecialization + oss << "," << detail::gemm_spec_name(kGemmSpecialization); // 16. GemmSpec + oss << "," << kNumGemmKPrefetchStage; // 17. NumGemmKPrefetchStage + oss << "," << kBlockSize; // 18. BlockSize + oss << "," << kMPerBlock; // 19. MPerBlock + oss << "," << kNPerBlock; // 20. NPerBlock + oss << "," << kKPerBlock; // 21. KPerBlock + oss << "," << kK1; // 22. K1 + oss << "," << kMPerWmma; // 23. MPerWmma + oss << "," << kNPerWmma; // 24. NPerWmma + oss << "," << kMRepeat; // 25. MRepeat + oss << "," << kNRepeat; // 26. NRepeat + oss << "," + << detail::array_to_string( + kAThreadClusterLengths); // 27. ABlockTransferThreadClusterLengths + oss << "," + << detail::array_to_string( + kAThreadClusterArrangeOrder); // 28. ABlockTransferThreadClusterArrangeOrder + oss << "," + << detail::array_to_string( + kABlockTransferSrcAccessOrder); // 29. ABlockTransferSrcAccessOrder + oss << "," << kABlockTransferSrcVectorDim; // 30. ABlockTransferSrcVectorDim + oss << "," << kABlockTransferSrcScalarPerVector; // 31. ABlockTransferSrcScalarPerVector + oss << "," + << kABlockTransferDstScalarPerVectorK1; // 32. ABlockTransferDstScalarPerVector_AK1 + oss << "," << (kABlockLdsExtraM ? "true" : "false"); // 33. ABlockLdsExtraM + oss << "," + << detail::array_to_string( + kBThreadClusterLengths); // 34. BBlockTransferThreadClusterLengths + oss << "," + << detail::array_to_string( + kBThreadClusterArrangeOrder); // 35. BBlockTransferThreadClusterArrangeOrder + oss << "," + << detail::array_to_string( + kBBlockTransferSrcAccessOrder); // 36. BBlockTransferSrcAccessOrder + oss << "," << kBBlockTransferSrcVectorDim; // 37. BBlockTransferSrcVectorDim + oss << "," << kBBlockTransferSrcScalarPerVector; // 38. BBlockTransferSrcScalarPerVector + oss << "," + << kBBlockTransferDstScalarPerVectorK1; // 39. BBlockTransferDstScalarPerVector_BK1 + oss << "," << (kBBlockLdsExtraN ? "true" : "false"); // 40. BBlockLdsExtraN + oss << "," << kCShuffleMRepeatPerShuffle; // 41. CShuffleMRepeatPerShuffle + oss << "," << kCShuffleNRepeatPerShuffle; // 42. CShuffleNRepeatPerShuffle + oss << "," + << detail::array_to_string( + kCDEThreadClusterLengths); // 43. CDEShuffleBlockTransferClusterLengths + oss << "," + << kCDEBlockTransferScalarPerVector; // 44. + // CDEShuffleBlockTransferScalarPerVector_NPerBlock + oss << "," << detail::loop_scheduler_name(kLoopScheduler); // 45. LoopSched + oss << "," << detail::pipeline_version_name(kPipelineVersion); // 46. PipelineVer + oss << ">"; + + return oss.str(); + } +}; + +} // namespace ck_tile::reflect diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp index 0896a9daf9..b41aed784c 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp @@ -32,8 +32,8 @@ template + ck::LoopScheduler LoopSched> struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor; } // namespace ck::tensor_operation::device diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp index 545441fd90..b13675a7b9 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -161,6 +162,19 @@ constexpr std::string_view pipeline_version_name(ck::BlockGemmPipelineVersion ve } } +// Convert PipelineVersion enum to string (for Wmma kernels) +constexpr std::string_view pipeline_version_name(ck::PipelineVersion ver) +{ + using enum ck::PipelineVersion; + switch(ver) + { + case v1: return "v1"; + case v2: return "v2"; + case v4: return "v4"; + case weight_only: return "weight_only"; + } +} + // Convert LoopScheduler enum to string constexpr std::string_view loop_scheduler_name(ck::LoopScheduler sched) { @@ -322,4 +336,24 @@ constexpr std::string tuple_name() }(static_cast(nullptr)); } +// Concept to check if a type is a ck::Tuple +template +concept IsCkTuple = + requires { [](ck::Tuple*) {}(static_cast(nullptr)); }; + +// Deduces whether to use tuple_name or type_name +// Handles both scalar data types and ck::Tuple types +template +constexpr std::string type_or_type_tuple_name() +{ + if constexpr(IsCkTuple) + { + return tuple_name(); + } + else + { + return std::string(type_name()); + } +} + } // namespace ck_tile::reflect::detail diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index c54c26cb67..66ecf32197 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -28,7 +28,9 @@ add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp) add_ck_builder_test(test_ckb_get_instance_string test_get_instance_string_fwd_grp_conv_v3.cpp test_get_instance_string_fwd_grp_conv.cpp - test_get_instance_string_fwd_grp_conv_large_tensor.cpp) + test_get_instance_string_fwd_grp_conv_large_tensor.cpp + test_get_instance_string_fwd_grp_conv_wmma.cpp + test_get_instance_string_fwd_grp_conv_dl.cpp) # Testing the fwd convolution builder requires kernel compilation. # To enable parallel compilation, the individual tests are split into separate files. diff --git a/experimental/builder/test/test_fwd_instance_traits.cpp b/experimental/builder/test/test_fwd_instance_traits.cpp index 181319bc18..2e3b6ac264 100644 --- a/experimental/builder/test/test_fwd_instance_traits.cpp +++ b/experimental/builder/test/test_fwd_instance_traits.cpp @@ -9,12 +9,21 @@ #include #include #include +#include +#include namespace { using ::testing::ElementsAre; -TEST(InstanceTraitsTest, ConvFwdInstanceTraitsExtraction) +// NOTE: The V3ExtractsAllFieldsCorrectly test below performs detailed field extraction testing +// for the V3 variant as a reference implementation. For new InstanceTraits specializations, +// only the instance_string() functionality needs to be tested. Each new specialization should have: +// 1. A test using instance_string() directly (in this file) +// 2. A test using GetInstanceString() through base class pointer (in separate +// test_get_instance_string_*.cpp file) This prevents test duplication while ensuring both access +// methods work correctly. +TEST(InstanceTraits, V3ExtractsAllFieldsCorrectly) { // Define a concrete instance type with specific template parameters using DeviceInstance = @@ -70,7 +79,8 @@ TEST(InstanceTraitsTest, ConvFwdInstanceTraitsExtraction) ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer ck::half_t, // AComputeDataType - ck::half_t>; // BComputeDataType + ck::half_t, // BComputeDataType + false>; // Use InstanceTraits to extract compile-time information using Traits = ck_tile::reflect::InstanceTraits; @@ -155,7 +165,7 @@ TEST(InstanceTraitsTest, ConvFwdInstanceTraitsExtraction) ck::tensor_operation::element_wise::PassThrough>::value)); } -TEST(InstanceTraitsTest, V3InstanceStringGeneration) +TEST(InstanceTraits, V3InstanceStringReturnsCorrectFormat) { // Define a concrete instance type with specific template parameters using DeviceInstance = @@ -211,7 +221,8 @@ TEST(InstanceTraitsTest, V3InstanceStringGeneration) ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer ck::half_t, // AComputeDataType - ck::half_t>; // BComputeDataType + ck::half_t, // BComputeDataType + false>; // DirectLoad std::string instance_str = ck_tile::reflect::instance_string(); @@ -263,12 +274,13 @@ TEST(InstanceTraitsTest, V3InstanceStringGeneration) ",Intrawave" // BlkGemmPipeSched ",v1" // BlkGemmPipelineVer ",fp16" // AComputeDataType - ",fp16>"; // BComputeDataType + ",fp16" // BComputeDataType + ",false>"; // DirectLoad EXPECT_EQ(instance_str, expected_str); } -TEST(InstanceTraitsTest, BaseInstanceStringGeneration) +TEST(InstanceTraits, BaseInstanceStringReturnsCorrectFormat) { using DeviceInstance = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< @@ -382,7 +394,7 @@ TEST(InstanceTraitsTest, BaseInstanceStringGeneration) EXPECT_EQ(instance_str, expected_str); } -TEST(InstanceTraitsTest, LargeTensorInstanceStringGeneration) +TEST(InstanceTraits, LargeTensorInstanceStringReturnsCorrectFormat) { using DeviceInstance = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor< @@ -497,4 +509,215 @@ TEST(InstanceTraitsTest, LargeTensorInstanceStringGeneration) EXPECT_EQ(instance_str, expected_str); } +TEST(InstanceTraits, WmmaInstanceStringReturnsCorrectFormat) +{ + using DeviceInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle< + 2, // NDimSpatial + ck::tensor_layout::convolution::GNHWC, // ALayout + ck::tensor_layout::convolution::GKYXC, // BLayout + ck::Tuple<>, // DsLayout + ck::tensor_layout::convolution::GNHWK, // ELayout + ck::half_t, // ADataType + ck::half_t, // BDataType + float, // AccDataType + ck::half_t, // CShuffleDataType + ck::Tuple<>, // DsDataType + ck::half_t, // EDataType + ck::tensor_operation::element_wise::PassThrough, // AElementwiseOperation + ck::tensor_operation::element_wise::PassThrough, // BElementwiseOperation + ck::tensor_operation::element_wise::PassThrough, // CDEElementwiseOperation + ck::tensor_operation::device::ConvolutionForwardSpecialization:: + Default, // ConvForwardSpecialization + ck::tensor_operation::device::GemmSpecialization::MNKPadding, // GemmSpec + 1, // NumGemmKPrefetchStage + 128, // BlockSize + 64, // MPerBlock + 64, // NPerBlock + 32, // KPerBlock + 8, // K1 + 16, // MPerWmma + 16, // NPerWmma + 2, // MRepeat + 2, // NRepeat + ck::Sequence<4, 32, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + ck::Sequence<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + ck::Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 1, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + ck::Sequence<4, 32, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + ck::Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + ck::Sequence<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 1, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, // CShuffleMRepeatPerShuffle + 1, // CShuffleNRepeatPerShuffle + ck::Sequence<1, + 32, + 1, + 4>, // CDEShuffleBlockTransferClusterLengths + 1, // CDEShuffleBlockTransferScalarPerVector_NPerBlock + ck::LoopScheduler::Default, // LoopSched + ck::PipelineVersion::v1>; // PipelineVer + + // Generate instance string + std::string instance_str = ck_tile::reflect::instance_string(); + + // Expected string with all 46 template parameters + std::string expected_str = "DeviceGroupedConvFwdMultipleD_Wmma_CShuffle" + "<2" // NDimSpatial + ",GNHWC" // ALayout + ",GKYXC" // BLayout + ",EmptyTuple" // DsLayout + ",GNHWK" // ELayout + ",fp16" // ADataType + ",fp16" // BDataType + ",fp32" // AccDataType + ",fp16" // CShuffleDataType + ",EmptyTuple" // DsDataType + ",fp16" // EDataType + ",PassThrough" // AElementwiseOperation + ",PassThrough" // BElementwiseOperation + ",PassThrough" // CDEElementwiseOperation + ",Default" // ConvForwardSpecialization + ",MNKPadding" // GemmSpec + ",1" // NumGemmKPrefetchStage + ",128" // BlockSize + ",64" // MPerBlock + ",64" // NPerBlock + ",32" // KPerBlock + ",8" // K1 + ",16" // MPerWmma + ",16" // NPerWmma + ",2" // MRepeat + ",2" // NRepeat + ",Seq(4,32,1)" // ABlockTransferThreadClusterLengths + ",Seq(1,0,2)" // ABlockTransferThreadClusterArrangeOrder + ",Seq(1,0,2)" // ABlockTransferSrcAccessOrder + ",2" // ABlockTransferSrcVectorDim + ",1" // ABlockTransferSrcScalarPerVector + ",8" // ABlockTransferDstScalarPerVector_AK1 + ",true" // ABlockLdsExtraM + ",Seq(4,32,1)" // BBlockTransferThreadClusterLengths + ",Seq(1,0,2)" // BBlockTransferThreadClusterArrangeOrder + ",Seq(1,0,2)" // BBlockTransferSrcAccessOrder + ",2" // BBlockTransferSrcVectorDim + ",1" // BBlockTransferSrcScalarPerVector + ",8" // BBlockTransferDstScalarPerVector_BK1 + ",true" // BBlockLdsExtraN + ",1" // CShuffleMRepeatPerShuffle + ",1" // CShuffleNRepeatPerShuffle + ",Seq(1,32,1,4)" // CDEShuffleBlockTransferClusterLengths + ",1" // CDEShuffleBlockTransferScalarPerVector_NPerBlock + ",Default" // LoopSched + ",v1>"; // PipelineVer + + // Verify the generated string matches exactly + EXPECT_EQ(instance_str, expected_str); +} + +TEST(InstanceTraits, DlInstanceStringReturnsCorrectFormat) +{ + using DeviceInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< + 2, // NDimSpatial + ck::half_t, // ADataType + ck::half_t, // BDataType + ck::Tuple<>, // DsDataType + ck::half_t, // EDataType + float, // AccDataType + ck::tensor_layout::convolution::GNHWC, // ALayout + ck::tensor_layout::convolution::GKYXC, // BLayout + ck::Tuple<>, // DsLayout + ck::tensor_layout::convolution::GNHWK, // ELayout + ck::tensor_operation::element_wise::PassThrough, // AElementwiseOperation + ck::tensor_operation::element_wise::PassThrough, // BElementwiseOperation + ck::tensor_operation::element_wise::PassThrough, // CDEElementwiseOperation + ck::tensor_operation::device::ConvolutionForwardSpecialization:: + Default, // ConvForwardSpecialization + ck::tensor_operation::device::GemmSpecialization::MNKPadding, // GemmSpec + 8, // BlockSize + 16, // MPerBlock + 4, // NPerBlock + 2, // K0PerBlock + 1, // K1 + 1, // M1PerThread + 2, // N1PerThread + 1, // KPerThread + ck::Sequence<4, 2>, // M1N1ThreadClusterM1Xs + ck::Sequence<1, 1>, // M1N1ThreadClusterN1Xs + ck::Sequence<2, 1, 2, 1>, // ABlockTransferThreadSliceLengths_K0_M0_M1_K1 + ck::Sequence<1, 1, 8, 1>, // ABlockTransferThreadClusterLengths_K0_M0_M1_K1 + ck::Sequence<1, 2, 0, 3>, // ABlockTransferThreadClusterArrangeOrder + ck::Sequence<1, 2, 0, 3>, // ABlockTransferSrcAccessOrder + ck::Sequence<1, 1, 1, 1>, // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 + ck::Sequence<1, 2, 0, 3>, // ABlockTransferSrcVectorTensorContiguousDimOrder + ck::Sequence<1, 1, 1, 1>, // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 + ck::Sequence<1, 1, 1, 1>, // BBlockTransferThreadSliceLengths_K0_N0_N1_K1 + ck::Sequence<2, 1, 4, 1>, // BBlockTransferThreadClusterLengths_K0_N0_N1_K1 + ck::Sequence<1, 2, 0, 3>, // BBlockTransferThreadClusterArrangeOrder + ck::Sequence<1, 2, 0, 3>, // BBlockTransferSrcAccessOrder + ck::Sequence<1, 1, 1, 1>, // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 + ck::Sequence<1, 2, 0, 3>, // BBlockTransferSrcVectorTensorContiguousDimOrder + ck::Sequence<1, 1, 1, 1>, // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 + ck::Sequence<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder + 5, // CThreadTransferSrcDstVectorDim + 1>; // CThreadTransferDstScalarPerVector + + // Generate instance string + std::string instance_str = ck_tile::reflect::instance_string(); + + // Expected string with all 42 template parameters + std::string expected_str = "DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK" + "<2" // NDimSpatial + ",fp16" // ADataType + ",fp16" // BDataType + ",EmptyTuple" // DsDataType + ",fp16" // EDataType + ",fp32" // AccDataType + ",GNHWC" // ALayout + ",GKYXC" // BLayout + ",EmptyTuple" // DsLayout + ",GNHWK" // ELayout + ",PassThrough" // AElementwiseOperation + ",PassThrough" // BElementwiseOperation + ",PassThrough" // CDEElementwiseOperation + ",Default" // ConvForwardSpecialization + ",MNKPadding" // GemmSpec + ",8" // BlockSize + ",16" // MPerBlock + ",4" // NPerBlock + ",2" // K0PerBlock + ",1" // K1 + ",1" // M1PerThread + ",2" // N1PerThread + ",1" // KPerThread + ",Seq(4,2)" // M1N1ThreadClusterM1Xs + ",Seq(1,1)" // M1N1ThreadClusterN1Xs + ",Seq(2,1,2,1)" // ABlockTransferThreadSliceLengths_K0_M0_M1_K1 + ",Seq(1,1,8,1)" // ABlockTransferThreadClusterLengths_K0_M0_M1_K1 + ",Seq(1,2,0,3)" // ABlockTransferThreadClusterArrangeOrder + ",Seq(1,2,0,3)" // ABlockTransferSrcAccessOrder + ",Seq(1,1,1,1)" // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 + ",Seq(1,2,0,3)" // ABlockTransferSrcVectorTensorContiguousDimOrder + ",Seq(1,1,1,1)" // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 + ",Seq(1,1,1,1)" // BBlockTransferThreadSliceLengths_K0_N0_N1_K1 + ",Seq(2,1,4,1)" // BBlockTransferThreadClusterLengths_K0_N0_N1_K1 + ",Seq(1,2,0,3)" // BBlockTransferThreadClusterArrangeOrder + ",Seq(1,2,0,3)" // BBlockTransferSrcAccessOrder + ",Seq(1,1,1,1)" // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 + ",Seq(1,2,0,3)" // BBlockTransferSrcVectorTensorContiguousDimOrder + ",Seq(1,1,1,1)" // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 + ",Seq(0,1,2,3,4,5)" // CThreadTransferSrcDstAccessOrder + ",5" // CThreadTransferSrcDstVectorDim + ",1>"; // CThreadTransferDstScalarPerVector + + // Verify the generated string matches exactly + EXPECT_EQ(instance_str, expected_str); +} + } // anonymous namespace diff --git a/experimental/builder/test/test_get_instance_string_fwd_grp_conv.cpp b/experimental/builder/test/test_get_instance_string_fwd_grp_conv.cpp index b2b0fb2389..7c43c428ce 100644 --- a/experimental/builder/test/test_get_instance_string_fwd_grp_conv.cpp +++ b/experimental/builder/test/test_get_instance_string_fwd_grp_conv.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include #include // Test GetInstanceString through base class pointer for non-V3 variant @@ -22,22 +22,8 @@ TEST(GetInstanceString, ReturnsStringForFwdGrpConvInstance) // Get the first instance from the tuple using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type; - // Define the base class type using DeviceGroupedConvFwdMultipleABD - using BaseClass = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD< - 2, // NDimSpatial - ck::tensor_operation::device::instance::GNHWC, // ALayout - ck::tensor_operation::device::instance::GKYXC, // BLayout - ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout - ck::tensor_operation::device::instance::GNHWK, // ELayout - ck::half_t, // ADataType - ck::half_t, // BDataType - ck::Tuple<>, // DsDataType - ck::half_t, // EDataType - ck::tensor_operation::element_wise::PassThrough, // AElementwiseOperation - ck::tensor_operation::element_wise::PassThrough, // BElementwiseOperation - ck::tensor_operation::element_wise::PassThrough, // CDEElementwiseOperation - ck::half_t, // AComputeType - ck::half_t>; // BComputeType + // Define the base class type using the most general operator base + using BaseClass = ck::tensor_operation::device::BaseOperator; // Create an instance of the derived class DeviceInstance device_instance; diff --git a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_dl.cpp b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_dl.cpp new file mode 100644 index 0000000000..54e026b308 --- /dev/null +++ b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_dl.cpp @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +// Test GetInstanceString through base class pointer for DL variant +TEST(GetInstanceString, ReturnsStringForFwdGrpConvDlInstance) +{ + // Use the template helper to get a working instance configuration + using InstanceTuple = + ck::tensor_operation::device::instance::device_grouped_conv2d_fwd_dl_f16_instances< + ck::tensor_operation::device::instance::GNHWC, // InLayout + ck::tensor_operation::device::instance::GKYXC, // WeiLayout + ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout + ck::tensor_operation::device::instance::GNHWK, // OutLayout + ck::Tuple<>, // DsDatatype + ck::tensor_operation::element_wise::PassThrough, // CDEElementOp + ck::tensor_operation::device::instance::ConvFwdDefault>; // ConvSpec + + // Get the first instance from the tuple + using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type; + + // Define the base class type using the most general operator base + using BaseClass = ck::tensor_operation::device::BaseOperator; + + // Create an instance of the derived class + DeviceInstance device_instance; + + // Get a pointer to the base class + BaseClass* base_ptr = &device_instance; + + // Call GetInstanceString through the base class pointer + std::string instance_str = base_ptr->GetInstanceString(); + + // Expected complete instance string based on the first instance from + // device_grouped_conv2d_fwd_dl_f16_instances + std::string expected_str = "DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK" + "<2" // NDimSpatial + ",fp16" // ADataType + ",fp16" // BDataType + ",EmptyTuple" // DsDataType + ",fp16" // EDataType + ",fp32" // AccDataType + ",GNHWC" // ALayout + ",GKYXC" // BLayout + ",EmptyTuple" // DsLayout + ",GNHWK" // ELayout + ",PassThrough" // AElementwiseOperation + ",PassThrough" // BElementwiseOperation + ",PassThrough" // CDEElementwiseOperation + ",Default" // ConvForwardSpecialization + ",MNKPadding" // GemmSpec + ",8" // BlockSize + ",16" // MPerBlock + ",4" // NPerBlock + ",2" // K0PerBlock + ",1" // K1 + ",1" // M1PerThread + ",2" // N1PerThread + ",1" // KPerThread + ",Seq(4,2)" // M1N1ThreadClusterM1Xs + ",Seq(1,1)" // M1N1ThreadClusterN1Xs + ",Seq(2,1,2,1)" // ABlockTransferThreadSliceLengths_K0_M0_M1_K1 + ",Seq(1,1,8,1)" // ABlockTransferThreadClusterLengths_K0_M0_M1_K1 + ",Seq(1,2,0,3)" // ABlockTransferThreadClusterArrangeOrder + ",Seq(1,2,0,3)" // ABlockTransferSrcAccessOrder + ",Seq(1,1,1,1)" // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 + ",Seq(1,2,0,3)" // ABlockTransferSrcVectorTensorContiguousDimOrder + ",Seq(1,1,1,1)" // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 + ",Seq(1,1,1,1)" // BBlockTransferThreadSliceLengths_K0_N0_N1_K1 + ",Seq(2,1,4,1)" // BBlockTransferThreadClusterLengths_K0_N0_N1_K1 + ",Seq(1,2,0,3)" // BBlockTransferThreadClusterArrangeOrder + ",Seq(1,2,0,3)" // BBlockTransferSrcAccessOrder + ",Seq(1,1,1,1)" // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 + ",Seq(1,2,0,3)" // BBlockTransferSrcVectorTensorContiguousDimOrder + ",Seq(1,1,1,1)" // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 + ",Seq(0,1,2,3,4,5)" // CThreadTransferSrcDstAccessOrder + ",5" // CThreadTransferSrcDstVectorDim + ",1>"; // CThreadTransferDstScalarPerVector + + EXPECT_EQ(instance_str, expected_str); +} diff --git a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_large_tensor.cpp b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_large_tensor.cpp index 4d50c34ea3..fd75fd9e6b 100644 --- a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_large_tensor.cpp +++ b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_large_tensor.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include #include // Test GetInstanceString through base class pointer for large tensor variant @@ -22,22 +22,8 @@ TEST(GetInstanceString, ReturnsStringForFwdGrpConvLargeTensorInstance) // Get the first instance from the tuple using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type; - // Define the base class type using DeviceGroupedConvFwdMultipleABD - using BaseClass = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD< - 2, // NDimSpatial - ck::tensor_operation::device::instance::GNHWC, // ALayout - ck::tensor_operation::device::instance::GKYXC, // BLayout - ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout - ck::tensor_operation::device::instance::GNHWK, // ELayout - ck::half_t, // ADataType - ck::half_t, // BDataType - ck::Tuple<>, // DsDataType - ck::half_t, // EDataType - ck::tensor_operation::element_wise::PassThrough, // AElementwiseOperation - ck::tensor_operation::element_wise::PassThrough, // BElementwiseOperation - ck::tensor_operation::element_wise::PassThrough, // CDEElementwiseOperation - ck::half_t, // AComputeType - ck::half_t>; // BComputeType + // Define the base class type using the most general operator base + using BaseClass = ck::tensor_operation::device::BaseOperator; // Create an instance of the derived class DeviceInstance device_instance; diff --git a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_v3.cpp b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_v3.cpp index 6870f6e5d0..45ab5db322 100644 --- a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_v3.cpp +++ b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_v3.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include #include // Test GetInstanceString through base class pointer for V3 variant @@ -22,22 +22,8 @@ TEST(GetInstanceString, ReturnsStringForFwdGrpConvV3Instance) // Get the first instance from the tuple using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type; - // Define the base class type using DeviceGroupedConvFwdMultipleABD - using BaseClass = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD< - 2, // NDimSpatial - ck::tensor_operation::device::instance::GNHWC, // ALayout - ck::tensor_operation::device::instance::GKYXC, // BLayout - ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout - ck::tensor_operation::device::instance::GNHWK, // ELayout - ck::half_t, // ADataType - ck::half_t, // BDataType - ck::Tuple<>, // DsDataType - ck::half_t, // EDataType - ck::tensor_operation::element_wise::PassThrough, // AElementwiseOperation - ck::tensor_operation::element_wise::PassThrough, // BElementwiseOperation - ck::tensor_operation::element_wise::PassThrough, // CDEElementwiseOperation - ck::half_t, // AComputeType - ck::half_t>; // BComputeType + // Define the base class type using the most general operator base + using BaseClass = ck::tensor_operation::device::BaseOperator; // Create an instance of the derived class DeviceInstance device_instance; @@ -99,6 +85,7 @@ TEST(GetInstanceString, ReturnsStringForFwdGrpConvV3Instance) ",Intrawave" // BlkGemmPipeSched ",v4" // BlkGemmPipelineVer ",fp16" // AComputeDataType - ",fp16>"; // BComputeDataType + ",fp16" // BComputeDataType + ",false>"; // DirectLoad EXPECT_EQ(instance_str, expected_str); } diff --git a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_wmma.cpp b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_wmma.cpp new file mode 100644 index 0000000000..e1505659f5 --- /dev/null +++ b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_wmma.cpp @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +// Test GetInstanceString through base class pointer for Wmma variant +TEST(GetInstanceString, ReturnsStringForFwdGrpConvWmmaInstance) +{ + // Use the template helper to get a working instance configuration + using InstanceTuple = + ck::tensor_operation::device::instance::device_grouped_conv_fwd_wmma_f16_instances< + 2, // NDimSpatial + ck::tensor_operation::device::instance::GNHWC, // ALayout + ck::tensor_operation::device::instance::GKYXC, // BLayout + ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout + ck::tensor_operation::device::instance::GNHWK, // ELayout + ck::Tuple<>, // DsDatatype + ck::tensor_operation::element_wise::PassThrough, // CDEElementOp + ck::tensor_operation::device::instance::ConvFwdDefault>; // ConvSpec + + // Get the first instance from the tuple + using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type; + + // Define the base class type using the most general operator base + using BaseClass = ck::tensor_operation::device::BaseOperator; + + // Create an instance of the derived class + DeviceInstance device_instance; + + // Get a pointer to the base class + BaseClass* base_ptr = &device_instance; + + // Call GetInstanceString through the base class pointer + std::string instance_str = base_ptr->GetInstanceString(); + + // Expected complete instance string based on the first instance from + // device_grouped_conv_fwd_wmma_f16_instances + std::string expected_str = "DeviceGroupedConvFwdMultipleD_Wmma_CShuffle" + "<2" // NDimSpatial + ",GNHWC" // ALayout + ",GKYXC" // BLayout + ",EmptyTuple" // DsLayout + ",GNHWK" // ELayout + ",fp16" // ADataType + ",fp16" // BDataType + ",fp32" // AccDataType + ",fp16" // CShuffleDataType + ",EmptyTuple" // DsDataType + ",fp16" // EDataType + ",PassThrough" // AElementwiseOperation + ",PassThrough" // BElementwiseOperation + ",PassThrough" // CDEElementwiseOperation + ",Default" // ConvForwardSpecialization + ",MNKPadding" // GemmSpec + ",1" // NumGemmKPrefetchStage + ",128" // BlockSize + ",64" // MPerBlock + ",64" // NPerBlock + ",32" // KPerBlock + ",8" // K1 + ",16" // MPerWmma + ",16" // NPerWmma + ",2" // MRepeat + ",2" // NRepeat + ",Seq(4,32,1)" // ABlockTransferThreadClusterLengths + ",Seq(1,0,2)" // ABlockTransferThreadClusterArrangeOrder + ",Seq(1,0,2)" // ABlockTransferSrcAccessOrder + ",2" // ABlockTransferSrcVectorDim + ",1" // ABlockTransferSrcScalarPerVector + ",8" // ABlockTransferDstScalarPerVector_AK1 + ",true" // ABlockLdsExtraM + ",Seq(4,32,1)" // BBlockTransferThreadClusterLengths + ",Seq(1,0,2)" // BBlockTransferThreadClusterArrangeOrder + ",Seq(1,0,2)" // BBlockTransferSrcAccessOrder + ",2" // BBlockTransferSrcVectorDim + ",1" // BBlockTransferSrcScalarPerVector + ",8" // BBlockTransferDstScalarPerVector_BK1 + ",true" // BBlockLdsExtraN + ",1" // CShuffleMRepeatPerShuffle + ",1" // CShuffleNRepeatPerShuffle + ",Seq(1,32,1,4)" // CDEShuffleBlockTransferClusterLengths + ",1" // CDEShuffleBlockTransferScalarPerVector_NPerBlock + ",Default" // LoopSched + ",v1>"; // PipelineVer + + EXPECT_EQ(instance_str, expected_str); +} diff --git a/experimental/builder/test/test_instance_traits_util.cpp b/experimental/builder/test/test_instance_traits_util.cpp index fe31e04e89..04d3cb978b 100644 --- a/experimental/builder/test/test_instance_traits_util.cpp +++ b/experimental/builder/test/test_instance_traits_util.cpp @@ -202,8 +202,8 @@ TEST(InstanceTraitsUtil, PipelineVersionNameReturnsCorrectStrings) TEST(InstanceTraitsUtil, LoopSchedulerNameReturnsCorrectStrings) { using enum ck::LoopScheduler; - EXPECT_THAT(std::vector names = {loop_scheduler_name(Default), - loop_scheduler_name(Interwave)}, + EXPECT_THAT((std::vector{loop_scheduler_name(Default), + loop_scheduler_name(Interwave)}), ElementsAre("Default", "Interwave")); } @@ -267,5 +267,15 @@ TEST(InstanceTraitsUtil, SequenceNameReturnsSeqStringForMultipleValueSequence) EXPECT_EQ((sequence_name>()), "Seq(256,128,64,32,16)"); } +TEST(InstanceTraitsUtil, TypeOrTypeTupleNameReturnsCorrectStringForScalarDataType) +{ + EXPECT_EQ(type_or_type_tuple_name(), "fp32"); +} + +TEST(InstanceTraitsUtil, TypeOrTypeTupleNameReturnsCorrectStringForTupleOfDataTypes) +{ + EXPECT_EQ((type_or_type_tuple_name>()), "Tuple(fp16,fp32)"); +} + } // namespace } // namespace ck_tile::reflect::detail diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp index 330f7fd809..3754eb2080 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp @@ -24,6 +24,10 @@ #include "ck/host_utility/kernel_launch.hpp" #include "ck/host_utility/io.hpp" +#ifdef CK_EXPERIMENTAL_BUILDER +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" +#endif + namespace ck { namespace tensor_operation { namespace device { @@ -1048,6 +1052,19 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK return str.str(); } + +#ifdef CK_EXPERIMENTAL_BUILDER + std::string GetInstanceString() const override + { + static_assert(ck_tile::reflect::HasInstanceTraits, + "Specialization of instance_traits not found. Please check that a " + "specialization exists in file " + "ck_tile/builder/reflect/" + "instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp " + "for the given template parameters."); + return ck_tile::reflect::instance_string(); + } +#endif }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp index 1c9f6b0094..ef0f01d08c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -24,6 +24,10 @@ #include "ck/host_utility/kernel_launch.hpp" #include "ck/host_utility/io.hpp" +#ifdef CK_EXPERIMENTAL_BUILDER +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp" +#endif + namespace ck { namespace tensor_operation { namespace device { @@ -1003,6 +1007,19 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle return str.str(); } + +#ifdef CK_EXPERIMENTAL_BUILDER + std::string GetInstanceString() const override + { + static_assert(ck_tile::reflect::HasInstanceTraits, + "Specialization of instance_traits not found. Please check that a " + "specialization exists in file " + "ck_tile/builder/reflect/" + "instance_traits_device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp " + "for the given template parameters."); + return ck_tile::reflect::instance_string(); + } +#endif }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index a0b33afb4d..d0802ff65d 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -1760,6 +1760,8 @@ struct FastNumericArrayConverter struct DynamicUnaryOp { + static constexpr const char* name = "DynamicUnaryOp"; + __host__ __device__ DynamicUnaryOp() = delete; __host__ __device__ DynamicUnaryOp(const Swish& swish)