[CK_BUILDER] Add Description::instance_string() method and update tests (#3340)

* Create Description::instance_string() function

To expose more reflection capabilities in MIOpen, we add the instance_string functionality to the ckr::Description class. This PR introduces a base class, adds the instance_string method, and implements the method by injecting the Traits::instance_string method through the ConvDescription constructor.

This will enable us to replace the specialized get_instance_string() method on device operations with a describe() method in a subsequent PR.

* Test describe().instance_string()

Update the instance string tests to also call `ckr::describe<Instance>().instance_string()`. This documents that the XDL kernels are supported with describe(), but WMMA and DL kernels are not yet supported. Also update the namespace and add a HasConvTraits concept.
This commit is contained in:
John Shumway
2025-12-03 06:36:09 -08:00
committed by GitHub
parent e6a583416b
commit f29b67cf9b
18 changed files with 802 additions and 670 deletions

View File

@@ -103,18 +103,18 @@ add_ck_builder_test(test_ckb_conv_builder
# compilable code. They are more expensive but catch real-world issues.
# Verifies that GetInstanceString() methods produce valid kernel code.
# Verifies that GetInstanceString() methods and other functions produce valid kernel code.
# Tests various convolution types:
# - Group convolution (v3, standard, large tensor, WMMA, DL variants)
# - Backward weight group convolution (XDL)
# Requires kernel compilation to validate the generated strings.
add_ck_builder_test(test_ckb_get_instance_string
test_get_instance_string_fwd_grp_conv_v3.cpp
test_get_instance_string_fwd_grp_conv.cpp
test_get_instance_string_fwd_grp_conv_large_tensor.cpp
test_get_instance_string_fwd_grp_conv_wmma.cpp
test_get_instance_string_fwd_grp_conv_dl.cpp
test_get_instance_string_bwd_weight_grp_conv_xdl.cpp)
# Requires kernel compilation to validate the generated strings through the base class.
add_ck_builder_test(test_ckb_instance_string
test_instance_string_fwd_grp_conv_v3.cpp
test_instance_string_fwd_grp_conv.cpp
test_instance_string_fwd_grp_conv_large_tensor.cpp
test_instance_string_fwd_grp_conv_wmma.cpp
test_instance_string_fwd_grp_conv_dl.cpp
test_instance_string_bwd_weight_grp_conv_xdl.cpp)
# Tests the forward convolution builder across multiple data types and dimensions.
# Individual tests are split into separate files to enable parallel compilation.
@@ -183,7 +183,7 @@ endforeach()
# Register all regression tests (integration tests with kernel compilation)
set(CKB_REGRESSION_TESTS
test_ckb_get_instance_string
test_ckb_instance_string
test_ckb_build_fwd_instances
test_ckb_testing_utils
test_ckb_factory_grouped_convolution_forward_convscale

View File

@@ -13,7 +13,7 @@
namespace {
namespace ckb = ck_tile::builder;
namespace ckr = ck_tile::reflect::conv;
namespace ckr = ck_tile::reflect;
namespace ckt = ck_tile::test;
// Defines the signature of the convolution operation to be tested.
@@ -110,7 +110,7 @@ TEST(ConvDescriptionTest, DefaultInstanceHasBriefDescription)
static constexpr const ConvSignature SIGNATURE;
static constexpr const DefaultAlgorithm ALGORITHM;
using Instance = ckb::ConvBuilder<SIGNATURE, ALGORITHM>::Instance;
EXPECT_THAT(ckr::Describe<Instance>().brief(), ckt::StringEqWithDiff("2D Forward convolution"));
EXPECT_THAT(ckr::describe<Instance>().brief(), ckt::StringEqWithDiff("2D Forward convolution"));
}
TEST(ConvDescriptionTest, DefaultInstanceHasDetailedDescription)
@@ -118,7 +118,7 @@ TEST(ConvDescriptionTest, DefaultInstanceHasDetailedDescription)
static constexpr const ConvSignature SIGNATURE;
static constexpr const DefaultAlgorithm ALGORITHM;
using Instance = ckb::ConvBuilder<SIGNATURE, ALGORITHM>::Instance;
EXPECT_THAT(ckr::Describe<Instance>().detailed(),
EXPECT_THAT(ckr::describe<Instance>().detailed(),
ckt::StringEqWithDiff( //
"2D Forward Convolution Kernel\n"
"├─ Signature\n"
@@ -162,6 +162,23 @@ TEST(ConvDescriptionTest, DefaultInstanceHasDetailedDescription)
" └─ Vector access (GMEM write) instruction size: 2"));
}
TEST(ConvDescriptionTest, DefaultInstanceHasInstanceString)
{
static constexpr const ConvSignature SIGNATURE;
static constexpr const DefaultAlgorithm ALGORITHM;
using Instance = ckb::ConvBuilder<SIGNATURE, ALGORITHM>::Instance;
// Get the instance string from the description
std::string instance_str = ckr::describe<Instance>().instance_string();
// Verify that the instance string is not empty
EXPECT_FALSE(instance_str.empty());
// Verify that it contains the device operation name
// The exact format depends on the InstanceTraits implementation
EXPECT_THAT(instance_str, ::testing::HasSubstr("DeviceGroupedConvFwdMultipleABD"));
}
// NOTE: BackwardDataInstanceHasDetailedDescription test is disabled because ConvFactory
// does not have a specialization for backward data convolutions. The test fails with:
// "implicit instantiation of undefined template 'ck_tile::builder::ConvFactory<...>'"
@@ -195,4 +212,5 @@ TEST(ConvDescriptionTest, DefaultInstanceHasDetailedDescription)
// EXPECT_THAT(ckr::Describe<Builder>().detailed(),
// ckt::StringEqWithDiff("PLACEHOLDER"));
// }
} // namespace

View File

@@ -1,86 +0,0 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include <ck_tile/builder/reflect/instance_traits.hpp>
#include <ck/tensor_operation/gpu/device/device_base.hpp>
#include <ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp>
// Test GetInstanceString through base class pointer for backward weight XDL variant
TEST(GetInstanceString, ReturnsStringForBwdWeightGrpConvXdlInstance)
{
// Use the template helper to get a working instance configuration
using InstanceTuple = ck::tensor_operation::device::instance::
device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<
2, // NDimSpatial
ck::tensor_operation::device::instance::GNHWC, // InLayout
ck::tensor_operation::device::instance::GKYXC, // WeiLayout
ck::tensor_operation::device::instance::GNHWK, // OutLayout
ck::tensor_operation::device::instance::
ConvBwdWeightDefault>; // ConvBwdWeightSpecialization
// Get the first instance from the tuple
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Define the base class type using the most general operator base
using BaseClass = ck::tensor_operation::device::BaseOperator;
// Create an instance of the derived class
DeviceInstance device_instance;
// Get a pointer to the base class
BaseClass* base_ptr = &device_instance;
// Call GetInstanceString through the base class pointer
std::string instance_str = base_ptr->GetInstanceString();
// Expected complete instance string based on the first instance from
// device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances
// This corresponds to the configuration with BlockSize=64, MPerBlock=64, NPerBlock=64, etc.
std::string expected_str = "DeviceGroupedConvBwdWeight_Xdl_CShuffle"
"<2" // NDimSpatial
",GNHWC" // InLayout
",GKYXC" // WeiLayout
",GNHWK" // OutLayout
",fp16" // InDataType
",fp16" // WeiDataType
",fp16" // OutDataType
",fp32" // AccDataType
",PassThrough" // InElementwiseOperation
",PassThrough" // WeiElementwiseOperation
",PassThrough" // OutElementwiseOperation
",Default" // ConvBackwardWeightSpecialization
",64" // BlockSize
",64" // MPerBlock
",64" // NPerBlock
",4" // K0PerBlock
",8" // K1
",32" // MPerXDL
",32" // NPerXDL
",2" // MXdlPerWave
",2" // NXdlPerWave
",Seq(1,4,8,2)" // ABlockTransferThreadClusterLengths_K0_M_K1
",Seq(0,3,1,2)" // ABlockTransferThreadClusterArrangeOrder
",Seq(0,2,1,3)" // ABlockTransferSrcAccessOrder
",2" // ABlockTransferSrcVectorDim
",2" // ABlockTransferSrcScalarPerVector
",4" // ABlockTransferDstScalarPerVector_K1
",true" // ABlockLdsAddExtraM
",Seq(1,4,8,2)" // BBlockTransferThreadClusterLengths_K0_N_K1
",Seq(0,3,1,2)" // BBlockTransferThreadClusterArrangeOrder
",Seq(0,2,1,3)" // BBlockTransferSrcAccessOrder
",2" // BBlockTransferSrcVectorDim
",2" // BBlockTransferSrcScalarPerVector
",4" // BBlockTransferDstScalarPerVector_K1
",true" // BBlockLdsAddExtraN
",1" // CShuffleMXdlPerWavePerShuffle
",1" // CShuffleNXdlPerWavePerShuffle
",Seq(1,16,1,4)" // CBlockTransferClusterLengths
",2" // CBlockTransferScalarPerVector_NWaveNPerXdl
",fp16" // ComputeTypeA
",fp16" // ComputeTypeB
",1" // MaxTransposeTransferSrcScalarPerVector
",1>"; // MaxTransposeTransferDstScalarPerVector
EXPECT_EQ(instance_str, expected_str);
}

View File

@@ -1,90 +0,0 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include <ck_tile/builder/reflect/instance_traits.hpp>
#include <ck/tensor_operation/gpu/device/device_base.hpp>
#include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp>
// Test GetInstanceString through base class pointer for non-V3 variant
TEST(GetInstanceString, ReturnsStringForFwdGrpConvInstance)
{
// Use the template helper to get a working instance configuration
using InstanceTuple =
ck::tensor_operation::device::instance::device_grouped_conv_fwd_xdl_f16_instances<
2, // NDimSpatial
ck::tensor_operation::device::instance::GNHWC, // ALayout
ck::tensor_operation::device::instance::GKYXC, // BLayout
ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout
ck::tensor_operation::device::instance::GNHWK, // ELayout
ck::tensor_operation::device::instance::ConvFwdDefault>; // ConvForwardSpecialization
// Get the first instance from the tuple
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Define the base class type using the most general operator base
using BaseClass = ck::tensor_operation::device::BaseOperator;
// Create an instance of the derived class
DeviceInstance device_instance;
// Get a pointer to the base class
BaseClass* base_ptr = &device_instance;
// Call GetInstanceString through the base class pointer
std::string instance_str = base_ptr->GetInstanceString();
// Expected complete instance string based on the first instance from
// device_grouped_conv_fwd_xdl_f16_instances
std::string expected_str = "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle"
"<2" // NDimSpatial
",GNHWC" // ALayout
",GKYXC" // BLayout
",EmptyTuple" // DsLayout
",GNHWK" // ELayout
",fp16" // ADataType
",fp16" // BDataType
",fp32" // AccDataType
",fp16" // CShuffleDataType
",EmptyTuple" // DsDataType
",fp16" // EDataType
",PassThrough" // AElementwiseOperation
",PassThrough" // BElementwiseOperation
",PassThrough" // CDEElementwiseOperation
",Default" // ConvForwardSpecialization
",MNKPadding" // GemmSpec
",1" // NumGemmKPrefetchStage
",64" // BlockSize
",64" // MPerBlock
",64" // NPerBlock
",32" // KPerBlock
",8" // AK1
",8" // BK1
",32" // MPerXDL
",32" // NPerXDL
",2" // MXdlPerWave
",2" // NXdlPerWave
",Seq(4,16,1)" // ABlockTransferThreadClusterLengths
",Seq(1,0,2)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",2" // ABlockTransferSrcVectorDim
",1" // ABlockTransferSrcScalarPerVector
",8" // ABlockTransferDstScalarPerVector_AK1
",1" // ABlockLdsExtraM
",Seq(4,16,1)" // BBlockTransferThreadClusterLengths
",Seq(1,0,2)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",2" // BBlockTransferSrcVectorDim
",1" // BBlockTransferSrcScalarPerVector
",8" // BBlockTransferDstScalarPerVector_BK1
",1" // BBlockLdsExtraN
",1" // CShuffleMXdlPerWavePerShuffle
",1" // CShuffleNXdlPerWavePerShuffle
",Seq(1,16,1,4)" // CDEBlockTransferClusterLengths
",1" // CDEBlockTransferScalarPerVector_NPerBlock
",fp16" // AComputeDataType
",fp16" // BComputeDataType
",Default" // LoopScheduler
",1>"; // NumGroupsToMerge
EXPECT_EQ(instance_str, expected_str);
}

View File

@@ -1,85 +0,0 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include <ck_tile/builder/reflect/instance_traits.hpp>
#include <ck/tensor_operation/gpu/device/device_base.hpp>
#include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_dl_instance.hpp>
// Test GetInstanceString through base class pointer for DL variant
TEST(GetInstanceString, ReturnsStringForFwdGrpConvDlInstance)
{
// Use the template helper to get a working instance configuration
using InstanceTuple =
ck::tensor_operation::device::instance::device_grouped_conv2d_fwd_dl_f16_instances<
ck::tensor_operation::device::instance::GNHWC, // InLayout
ck::tensor_operation::device::instance::GKYXC, // WeiLayout
ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout
ck::tensor_operation::device::instance::GNHWK, // OutLayout
ck::Tuple<>, // DsDatatype
ck::tensor_operation::element_wise::PassThrough, // CDEElementOp
ck::tensor_operation::device::instance::ConvFwdDefault>; // ConvSpec
// Get the first instance from the tuple
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Define the base class type using the most general operator base
using BaseClass = ck::tensor_operation::device::BaseOperator;
// Create an instance of the derived class
DeviceInstance device_instance;
// Get a pointer to the base class
BaseClass* base_ptr = &device_instance;
// Call GetInstanceString through the base class pointer
std::string instance_str = base_ptr->GetInstanceString();
// Expected complete instance string based on the first instance from
// device_grouped_conv2d_fwd_dl_f16_instances
std::string expected_str = "DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK"
"<2" // NDimSpatial
",fp16" // ADataType
",fp16" // BDataType
",EmptyTuple" // DsDataType
",fp16" // EDataType
",fp32" // AccDataType
",GNHWC" // ALayout
",GKYXC" // BLayout
",EmptyTuple" // DsLayout
",GNHWK" // ELayout
",PassThrough" // AElementwiseOperation
",PassThrough" // BElementwiseOperation
",PassThrough" // CDEElementwiseOperation
",Default" // ConvForwardSpecialization
",MNKPadding" // GemmSpec
",8" // BlockSize
",16" // MPerBlock
",4" // NPerBlock
",2" // K0PerBlock
",1" // K1
",1" // M1PerThread
",2" // N1PerThread
",1" // KPerThread
",Seq(4,2)" // M1N1ThreadClusterM1Xs
",Seq(1,1)" // M1N1ThreadClusterN1Xs
",Seq(2,1,2,1)" // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
",Seq(1,1,8,1)" // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
",Seq(1,2,0,3)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,2,0,3)" // ABlockTransferSrcAccessOrder
",Seq(1,1,1,1)" // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
",Seq(1,2,0,3)" // ABlockTransferSrcVectorTensorContiguousDimOrder
",Seq(1,1,1,1)" // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
",Seq(1,1,1,1)" // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
",Seq(2,1,4,1)" // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
",Seq(1,2,0,3)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,2,0,3)" // BBlockTransferSrcAccessOrder
",Seq(1,1,1,1)" // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
",Seq(1,2,0,3)" // BBlockTransferSrcVectorTensorContiguousDimOrder
",Seq(1,1,1,1)" // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
",Seq(0,1,2,3,4,5)" // CThreadTransferSrcDstAccessOrder
",5" // CThreadTransferSrcDstVectorDim
",1>"; // CThreadTransferDstScalarPerVector
EXPECT_EQ(instance_str, expected_str);
}

View File

@@ -1,89 +0,0 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include <ck_tile/builder/reflect/instance_traits.hpp>
#include <ck/tensor_operation/gpu/device/device_base.hpp>
#include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp>
// Test GetInstanceString through base class pointer for large tensor variant
TEST(GetInstanceString, ReturnsStringForFwdGrpConvLargeTensorInstance)
{
// Use the template helper to get a working instance configuration
using InstanceTuple = ck::tensor_operation::device::instance::
device_grouped_conv_fwd_xdl_large_tensor_f16_instances<
2, // NDimSpatial
ck::tensor_operation::device::instance::GNHWC, // ALayout
ck::tensor_operation::device::instance::GKYXC, // BLayout
ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout
ck::tensor_operation::device::instance::GNHWK, // ELayout
ck::tensor_operation::device::instance::ConvFwdDefault>; // ConvForwardSpecialization
// Get the first instance from the tuple
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Define the base class type using the most general operator base
using BaseClass = ck::tensor_operation::device::BaseOperator;
// Create an instance of the derived class
DeviceInstance device_instance;
// Get a pointer to the base class
BaseClass* base_ptr = &device_instance;
// Call GetInstanceString through the base class pointer
std::string instance_str = base_ptr->GetInstanceString();
// Expected complete instance string based on the first instance from
// device_grouped_conv_fwd_xdl_large_tensor_f16_instances
std::string expected_str = "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor"
"<2" // NDimSpatial
",GNHWC" // ALayout
",GKYXC" // BLayout
",EmptyTuple" // DsLayout
",GNHWK" // ELayout
",fp16" // ADataType
",fp16" // BDataType
",fp32" // AccDataType
",fp16" // CShuffleDataType
",EmptyTuple" // DsDataType
",fp16" // EDataType
",PassThrough" // AElementwiseOperation
",PassThrough" // BElementwiseOperation
",PassThrough" // CDEElementwiseOperation
",Default" // ConvForwardSpecialization
",MNKPadding" // GemmSpec
",1" // NumGemmKPrefetchStage
",64" // BlockSize
",64" // MPerBlock
",64" // NPerBlock
",32" // KPerBlock
",8" // AK1
",8" // BK1
",32" // MPerXDL
",32" // NPerXDL
",2" // MXdlPerWave
",2" // NXdlPerWave
",Seq(4,16,1)" // ABlockTransferThreadClusterLengths
",Seq(1,0,2)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",2" // ABlockTransferSrcVectorDim
",1" // ABlockTransferSrcScalarPerVector
",8" // ABlockTransferDstScalarPerVector_AK1
",1" // ABlockLdsExtraM
",Seq(4,16,1)" // BBlockTransferThreadClusterLengths
",Seq(1,0,2)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",2" // BBlockTransferSrcVectorDim
",1" // BBlockTransferSrcScalarPerVector
",8" // BBlockTransferDstScalarPerVector_BK1
",1" // BBlockLdsExtraN
",1" // CShuffleMXdlPerWavePerShuffle
",1" // CShuffleNXdlPerWavePerShuffle
",Seq(1,16,1,4)" // CDEBlockTransferClusterLengths
",1" // CDEBlockTransferScalarPerVector_NPerBlock
",fp16" // AComputeDataType
",fp16" // BComputeDataType
",Default>"; // LoopScheduler
EXPECT_EQ(instance_str, expected_str);
}

View File

@@ -1,91 +0,0 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include <ck_tile/builder/reflect/instance_traits.hpp>
#include <ck/tensor_operation/gpu/device/device_base.hpp>
#include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp>
// Test GetInstanceString through base class pointer for V3 variant
TEST(GetInstanceString, ReturnsStringForFwdGrpConvV3Instance)
{
// Use the template helper to get a working instance configuration
using InstanceTuple =
ck::tensor_operation::device::instance::device_grouped_conv_fwd_xdl_f16_comp_instances<
2, // NDimSpatial
ck::tensor_operation::device::instance::GNHWC, // ALayout
ck::tensor_operation::device::instance::GKYXC, // BLayout
ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout
ck::tensor_operation::device::instance::GNHWK, // ELayout
ck::tensor_operation::device::instance::ConvFwdDefault>; // ConvForwardSpecialization
// Get the first instance from the tuple
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Define the base class type using the most general operator base
using BaseClass = ck::tensor_operation::device::BaseOperator;
// Create an instance of the derived class
DeviceInstance device_instance;
// Get a pointer to the base class
BaseClass* base_ptr = &device_instance;
// Call GetInstanceString through the base class pointer
std::string instance_str = base_ptr->GetInstanceString();
// Expected complete instance string based on the first instance from
// device_grouped_conv_fwd_xdl_f16_comp_instances This corresponds to the configuration with
// BlockSize=256, MPerBlock=128, NPerBlock=128, KPerBlock=64, etc.
std::string expected_str = "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"
"<2" // NDimSpatial
",GNHWC" // ALayout
",GKYXC" // BLayout
",EmptyTuple" // DsLayout
",GNHWK" // ELayout
",fp16" // ADataType
",fp16" // BDataType
",fp32" // AccDataType
",fp16" // CShuffleDataType
",EmptyTuple" // DsDataType
",fp16" // EDataType
",PassThrough" // AElementwiseOperation
",PassThrough" // BElementwiseOperation
",PassThrough" // CDEElementwiseOperation
",Default" // ConvForwardSpecialization
",MNKPadding" // GemmSpec
",256" // BlockSize
",128" // MPerBlock
",128" // NPerBlock
",64" // KPerBlock
",8" // AK1
",8" // BK1
",32" // MPerXDL
",32" // NPerXDL
",2" // MXdlPerWave
",2" // NXdlPerWave
",Seq(8,32,1)" // ABlockTransferThreadClusterLengths
",Seq(1,0,2)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",2" // ABlockTransferSrcVectorDim
",8" // ABlockTransferSrcScalarPerVector
",8" // ABlockTransferDstScalarPerVector_AK1
",0" // ABlockLdsExtraM
",Seq(8,32,1)" // BBlockTransferThreadClusterLengths
",Seq(1,0,2)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",2" // BBlockTransferSrcVectorDim
",8" // BBlockTransferSrcScalarPerVector
",8" // BBlockTransferDstScalarPerVector_BK1
",0" // BBlockLdsExtraN
",1" // CShuffleMXdlPerWavePerShuffle
",1" // CShuffleNXdlPerWavePerShuffle
",Seq(1,32,1,8)" // CDEBlockTransferClusterLengths
",8" // CDEBlockTransferScalarPerVector_NPerBlock
",Intrawave" // BlkGemmPipeSched
",v4" // BlkGemmPipelineVer
",fp16" // AComputeDataType
",fp16" // BComputeDataType
",false>"; // DirectLoad
EXPECT_EQ(instance_str, expected_str);
}

View File

@@ -1,90 +0,0 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include <ck_tile/builder/reflect/instance_traits.hpp>
#include <ck/tensor_operation/gpu/device/device_base.hpp>
#include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp>
// Test GetInstanceString through base class pointer for Wmma variant
TEST(GetInstanceString, ReturnsStringForFwdGrpConvWmmaInstance)
{
// Use the template helper to get a working instance configuration
using InstanceTuple =
ck::tensor_operation::device::instance::device_grouped_conv_fwd_wmma_f16_instances<
2, // NDimSpatial
ck::tensor_operation::device::instance::GNHWC, // ALayout
ck::tensor_operation::device::instance::GKYXC, // BLayout
ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout
ck::tensor_operation::device::instance::GNHWK, // ELayout
ck::Tuple<>, // DsDatatype
ck::tensor_operation::element_wise::PassThrough, // CDEElementOp
ck::tensor_operation::device::instance::ConvFwdDefault>; // ConvSpec
// Get the first instance from the tuple
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Define the base class type using the most general operator base
using BaseClass = ck::tensor_operation::device::BaseOperator;
// Create an instance of the derived class
DeviceInstance device_instance;
// Get a pointer to the base class
BaseClass* base_ptr = &device_instance;
// Call GetInstanceString through the base class pointer
std::string instance_str = base_ptr->GetInstanceString();
// Expected complete instance string based on the first instance from
// device_grouped_conv_fwd_wmma_f16_instances
std::string expected_str = "DeviceGroupedConvFwdMultipleD_Wmma_CShuffle"
"<2" // NDimSpatial
",GNHWC" // ALayout
",GKYXC" // BLayout
",EmptyTuple" // DsLayout
",GNHWK" // ELayout
",fp16" // ADataType
",fp16" // BDataType
",fp32" // AccDataType
",fp16" // CShuffleDataType
",EmptyTuple" // DsDataType
",fp16" // EDataType
",PassThrough" // AElementwiseOperation
",PassThrough" // BElementwiseOperation
",PassThrough" // CDEElementwiseOperation
",Default" // ConvForwardSpecialization
",MNKPadding" // GemmSpec
",1" // NumGemmKPrefetchStage
",128" // BlockSize
",64" // MPerBlock
",64" // NPerBlock
",32" // KPerBlock
",8" // K1
",16" // MPerWmma
",16" // NPerWmma
",2" // MRepeat
",2" // NRepeat
",Seq(4,32,1)" // ABlockTransferThreadClusterLengths
",Seq(1,0,2)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",2" // ABlockTransferSrcVectorDim
",1" // ABlockTransferSrcScalarPerVector
",8" // ABlockTransferDstScalarPerVector_AK1
",true" // ABlockLdsExtraM
",Seq(4,32,1)" // BBlockTransferThreadClusterLengths
",Seq(1,0,2)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",2" // BBlockTransferSrcVectorDim
",1" // BBlockTransferSrcScalarPerVector
",8" // BBlockTransferDstScalarPerVector_BK1
",true" // BBlockLdsExtraN
",1" // CShuffleMRepeatPerShuffle
",1" // CShuffleNRepeatPerShuffle
",Seq(1,32,1,4)" // CDEShuffleBlockTransferClusterLengths
",1" // CDEShuffleBlockTransferScalarPerVector_NPerBlock
",Default" // LoopSched
",v1>"; // PipelineVer
EXPECT_EQ(instance_str, expected_str);
}

View File

@@ -0,0 +1,90 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include "ck_tile/builder/reflect/instance_traits.hpp"
#include "ck_tile/builder/reflect/conv_description.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
namespace {
namespace ckr = ck_tile::reflect;
// Use the template helper to get a working instance configuration
using InstanceTuple = ck::tensor_operation::device::instance::
device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<
2, // NDimSpatial
ck::tensor_operation::device::instance::GNHWC, // InLayout
ck::tensor_operation::device::instance::GKYXC, // WeiLayout
ck::tensor_operation::device::instance::GNHWK, // OutLayout
ck::tensor_operation::device::instance::
ConvBwdWeightDefault>; // ConvBwdWeightSpecialization
// Get the first instance from the tuple
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Expected complete instance string based on the first instance from
// device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances
// This corresponds to the configuration with BlockSize=64, MPerBlock=64, NPerBlock=64, etc.
std::string expected_str = "DeviceGroupedConvBwdWeight_Xdl_CShuffle"
"<2" // NDimSpatial
",GNHWC" // InLayout
",GKYXC" // WeiLayout
",GNHWK" // OutLayout
",fp16" // InDataType
",fp16" // WeiDataType
",fp16" // OutDataType
",fp32" // AccDataType
",PassThrough" // InElementwiseOperation
",PassThrough" // WeiElementwiseOperation
",PassThrough" // OutElementwiseOperation
",Default" // ConvBackwardWeightSpecialization
",64" // BlockSize
",64" // MPerBlock
",64" // NPerBlock
",4" // K0PerBlock
",8" // K1
",32" // MPerXDL
",32" // NPerXDL
",2" // MXdlPerWave
",2" // NXdlPerWave
",Seq(1,4,8,2)" // ABlockTransferThreadClusterLengths_K0_M_K1
",Seq(0,3,1,2)" // ABlockTransferThreadClusterArrangeOrder
",Seq(0,2,1,3)" // ABlockTransferSrcAccessOrder
",2" // ABlockTransferSrcVectorDim
",2" // ABlockTransferSrcScalarPerVector
",4" // ABlockTransferDstScalarPerVector_K1
",true" // ABlockLdsAddExtraM
",Seq(1,4,8,2)" // BBlockTransferThreadClusterLengths_K0_N_K1
",Seq(0,3,1,2)" // BBlockTransferThreadClusterArrangeOrder
",Seq(0,2,1,3)" // BBlockTransferSrcAccessOrder
",2" // BBlockTransferSrcVectorDim
",2" // BBlockTransferSrcScalarPerVector
",4" // BBlockTransferDstScalarPerVector_K1
",true" // BBlockLdsAddExtraN
",1" // CShuffleMXdlPerWavePerShuffle
",1" // CShuffleNXdlPerWavePerShuffle
",Seq(1,16,1,4)" // CBlockTransferClusterLengths
",2" // CBlockTransferScalarPerVector_NWaveNPerXdl
",fp16" // ComputeTypeA
",fp16" // ComputeTypeB
",1" // MaxTransposeTransferSrcScalarPerVector
",1>"; // MaxTransposeTransferDstScalarPerVector
// Test GetInstanceString through base class pointer for backward weight XDL variant
TEST(InstanceString, GetInstanceStringReturnsCorrectValueForBwdWeightGrpConvXdl)
{
using BaseClass = ck::tensor_operation::device::BaseOperator;
DeviceInstance device_instance;
BaseClass* base_ptr = &device_instance;
EXPECT_EQ(base_ptr->GetInstanceString(), expected_str);
}
// TODO: Add DescriptionReturnsCorrectValueForBwdWeightGrpConvXdl test once ckr::describe supports
// backward weight convolution The backward weight variant uses different layout naming (InLayout,
// WeiLayout, OutLayout) instead of (ALayout, BLayout, ELayout) and has
// ConvBackwardWeightSpecialization
} // namespace

View File

@@ -0,0 +1,95 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "gtest/gtest.h"
#include "ck_tile/builder/reflect/instance_traits.hpp"
#include "ck_tile/builder/reflect/conv_description.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
namespace {
namespace ckr = ck_tile::reflect;
// Use the template helper to get a working instance configuration
using InstanceTuple =
ck::tensor_operation::device::instance::device_grouped_conv_fwd_xdl_f16_instances<
2, // NDimSpatial
ck::tensor_operation::device::instance::GNHWC, // ALayout
ck::tensor_operation::device::instance::GKYXC, // BLayout
ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout
ck::tensor_operation::device::instance::GNHWK, // ELayout
ck::tensor_operation::device::instance::ConvFwdDefault>; // ConvForwardSpecialization
// Get the first instance from the tuple
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Expected complete instance string based on the first instance from
// device_grouped_conv_fwd_xdl_f16_instances
std::string expected_str = "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle"
"<2" // NDimSpatial
",GNHWC" // ALayout
",GKYXC" // BLayout
",EmptyTuple" // DsLayout
",GNHWK" // ELayout
",fp16" // ADataType
",fp16" // BDataType
",fp32" // AccDataType
",fp16" // CShuffleDataType
",EmptyTuple" // DsDataType
",fp16" // EDataType
",PassThrough" // AElementwiseOperation
",PassThrough" // BElementwiseOperation
",PassThrough" // CDEElementwiseOperation
",Default" // ConvForwardSpecialization
",MNKPadding" // GemmSpec
",1" // NumGemmKPrefetchStage
",64" // BlockSize
",64" // MPerBlock
",64" // NPerBlock
",32" // KPerBlock
",8" // AK1
",8" // BK1
",32" // MPerXDL
",32" // NPerXDL
",2" // MXdlPerWave
",2" // NXdlPerWave
",Seq(4,16,1)" // ABlockTransferThreadClusterLengths
",Seq(1,0,2)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",2" // ABlockTransferSrcVectorDim
",1" // ABlockTransferSrcScalarPerVector
",8" // ABlockTransferDstScalarPerVector_AK1
",1" // ABlockLdsExtraM
",Seq(4,16,1)" // BBlockTransferThreadClusterLengths
",Seq(1,0,2)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",2" // BBlockTransferSrcVectorDim
",1" // BBlockTransferSrcScalarPerVector
",8" // BBlockTransferDstScalarPerVector_BK1
",1" // BBlockLdsExtraN
",1" // CShuffleMXdlPerWavePerShuffle
",1" // CShuffleNXdlPerWavePerShuffle
",Seq(1,16,1,4)" // CDEBlockTransferClusterLengths
",1" // CDEBlockTransferScalarPerVector_NPerBlock
",fp16" // AComputeDataType
",fp16" // BComputeDataType
",Default" // LoopScheduler
",1>"; // NumGroupsToMerge
// Exercise GetInstanceString() for the standard XDL variant. The call is made
// through a BaseOperator pointer to cover the polymorphic (virtual) path.
TEST(InstanceString, GetInstanceStringReturnsCorrectValueForFwdGrpConv)
{
    DeviceInstance instance;
    ck::tensor_operation::device::BaseOperator* base = &instance;
    EXPECT_EQ(base->GetInstanceString(), expected_str);
}
// describe() must report the same instance string as the device operation itself.
TEST(InstanceString, DescriptionReturnsCorrectValueForFwdGrpConv)
{
    const auto description = ckr::describe<DeviceInstance>();
    EXPECT_EQ(description.instance_string(), expected_str);
}
} // namespace

View File

@@ -0,0 +1,88 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include <ck_tile/builder/reflect/instance_traits.hpp>
#include <ck_tile/builder/reflect/conv_description.hpp>
#include <ck/tensor_operation/gpu/device/device_base.hpp>
#include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_dl_instance.hpp>
namespace {
namespace ckr = ck_tile::reflect;
// Use the template helper to get a working instance configuration
using InstanceTuple =
ck::tensor_operation::device::instance::device_grouped_conv2d_fwd_dl_f16_instances<
ck::tensor_operation::device::instance::GNHWC, // InLayout
ck::tensor_operation::device::instance::GKYXC, // WeiLayout
ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout
ck::tensor_operation::device::instance::GNHWK, // OutLayout
ck::Tuple<>, // DsDatatype
ck::tensor_operation::element_wise::PassThrough, // CDEElementOp
ck::tensor_operation::device::instance::ConvFwdDefault>; // ConvSpec
// Get the first instance from the tuple
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Expected complete instance string based on the first instance from
// device_grouped_conv2d_fwd_dl_f16_instances. Each ",<value>" fragment below maps
// to one template argument of the DL device operation, in declaration order.
// Declared const: this is a read-only reference value shared by the tests below.
const std::string expected_str = "DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK"
"<2" // NDimSpatial
",fp16" // ADataType
",fp16" // BDataType
",EmptyTuple" // DsDataType
",fp16" // EDataType
",fp32" // AccDataType
",GNHWC" // ALayout
",GKYXC" // BLayout
",EmptyTuple" // DsLayout
",GNHWK" // ELayout
",PassThrough" // AElementwiseOperation
",PassThrough" // BElementwiseOperation
",PassThrough" // CDEElementwiseOperation
",Default" // ConvForwardSpecialization
",MNKPadding" // GemmSpec
",8" // BlockSize
",16" // MPerBlock
",4" // NPerBlock
",2" // K0PerBlock
",1" // K1
",1" // M1PerThread
",2" // N1PerThread
",1" // KPerThread
",Seq(4,2)" // M1N1ThreadClusterM1Xs
",Seq(1,1)" // M1N1ThreadClusterN1Xs
",Seq(2,1,2,1)" // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
",Seq(1,1,8,1)" // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
",Seq(1,2,0,3)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,2,0,3)" // ABlockTransferSrcAccessOrder
",Seq(1,1,1,1)" // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
",Seq(1,2,0,3)" // ABlockTransferSrcVectorTensorContiguousDimOrder
",Seq(1,1,1,1)" // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
",Seq(1,1,1,1)" // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
",Seq(2,1,4,1)" // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
",Seq(1,2,0,3)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,2,0,3)" // BBlockTransferSrcAccessOrder
",Seq(1,1,1,1)" // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
",Seq(1,2,0,3)" // BBlockTransferSrcVectorTensorContiguousDimOrder
",Seq(1,1,1,1)" // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
",Seq(0,1,2,3,4,5)" // CThreadTransferSrcDstAccessOrder
",5" // CThreadTransferSrcDstVectorDim
",1>"; // CThreadTransferDstScalarPerVector
// Exercise GetInstanceString() for the DL variant. The call is made through a
// BaseOperator pointer to cover the polymorphic (virtual) path.
TEST(InstanceString, GetInstanceStringReturnsCorrectValueForFwdGrpConvDl)
{
    DeviceInstance instance;
    ck::tensor_operation::device::BaseOperator* base = &instance;
    EXPECT_EQ(base->GetInstanceString(), expected_str);
}
// TODO: Add a DescriptionReturnsCorrectValueForFwdGrpConvDl test once ckr::describe supports DL
// variants. The DL variant uses a different internal structure (K0PerBlock, K1, M1PerThread, etc.)
// and is missing standard members like kKPerBlock, kMPerXDL, and kAK1.
} // namespace

View File

@@ -0,0 +1,94 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include <ck_tile/builder/reflect/instance_traits.hpp>
#include <ck_tile/builder/reflect/conv_description.hpp>
#include <ck/tensor_operation/gpu/device/device_base.hpp>
#include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp>
namespace {
namespace ckr = ck_tile::reflect;
// Use the template helper to get a working instance configuration
using InstanceTuple =
ck::tensor_operation::device::instance::device_grouped_conv_fwd_xdl_large_tensor_f16_instances<
2, // NDimSpatial
ck::tensor_operation::device::instance::GNHWC, // ALayout
ck::tensor_operation::device::instance::GKYXC, // BLayout
ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout
ck::tensor_operation::device::instance::GNHWK, // ELayout
ck::tensor_operation::device::instance::ConvFwdDefault>; // ConvForwardSpecialization
// Get the first instance from the tuple
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Expected complete instance string based on the first instance from
// device_grouped_conv_fwd_xdl_large_tensor_f16_instances. Each ",<value>" fragment
// below maps to one template argument of the device operation, in declaration order.
// Declared const: this is a read-only reference value shared by the tests below.
const std::string expected_str = "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor"
"<2" // NDimSpatial
",GNHWC" // ALayout
",GKYXC" // BLayout
",EmptyTuple" // DsLayout
",GNHWK" // ELayout
",fp16" // ADataType
",fp16" // BDataType
",fp32" // AccDataType
",fp16" // CShuffleDataType
",EmptyTuple" // DsDataType
",fp16" // EDataType
",PassThrough" // AElementwiseOperation
",PassThrough" // BElementwiseOperation
",PassThrough" // CDEElementwiseOperation
",Default" // ConvForwardSpecialization
",MNKPadding" // GemmSpec
",1" // NumGemmKPrefetchStage
",64" // BlockSize
",64" // MPerBlock
",64" // NPerBlock
",32" // KPerBlock
",8" // AK1
",8" // BK1
",32" // MPerXDL
",32" // NPerXDL
",2" // MXdlPerWave
",2" // NXdlPerWave
",Seq(4,16,1)" // ABlockTransferThreadClusterLengths
",Seq(1,0,2)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",2" // ABlockTransferSrcVectorDim
",1" // ABlockTransferSrcScalarPerVector
",8" // ABlockTransferDstScalarPerVector_AK1
",1" // ABlockLdsExtraM
",Seq(4,16,1)" // BBlockTransferThreadClusterLengths
",Seq(1,0,2)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",2" // BBlockTransferSrcVectorDim
",1" // BBlockTransferSrcScalarPerVector
",8" // BBlockTransferDstScalarPerVector_BK1
",1" // BBlockLdsExtraN
",1" // CShuffleMXdlPerWavePerShuffle
",1" // CShuffleNXdlPerWavePerShuffle
",Seq(1,16,1,4)" // CDEBlockTransferClusterLengths
",1" // CDEBlockTransferScalarPerVector_NPerBlock
",fp16" // AComputeDataType
",fp16" // BComputeDataType
",Default>"; // LoopScheduler
// Exercise GetInstanceString() for the large-tensor variant. The call is made
// through a BaseOperator pointer to cover the polymorphic (virtual) path.
TEST(InstanceString, GetInstanceStringReturnsCorrectValueForFwdGrpConvLargeTensor)
{
    DeviceInstance instance;
    ck::tensor_operation::device::BaseOperator* base = &instance;
    EXPECT_EQ(base->GetInstanceString(), expected_str);
}
// describe() must report the same instance string as the device operation itself.
TEST(InstanceString, DescriptionReturnsCorrectValueForFwdGrpConvLargeTensor)
{
    const auto description = ckr::describe<DeviceInstance>();
    EXPECT_EQ(description.instance_string(), expected_str);
}
} // namespace

View File

@@ -0,0 +1,96 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include <ck_tile/builder/reflect/instance_traits.hpp>
#include <ck_tile/builder/reflect/conv_description.hpp>
#include <ck/tensor_operation/gpu/device/device_base.hpp>
#include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp>
namespace {
namespace ckr = ck_tile::reflect;
// Use the template helper to get a working instance configuration
using InstanceTuple =
ck::tensor_operation::device::instance::device_grouped_conv_fwd_xdl_f16_comp_instances<
2, // NDimSpatial
ck::tensor_operation::device::instance::GNHWC, // ALayout
ck::tensor_operation::device::instance::GKYXC, // BLayout
ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout
ck::tensor_operation::device::instance::GNHWK, // ELayout
ck::tensor_operation::device::instance::ConvFwdDefault>; // ConvForwardSpecialization
// Get the first instance from the tuple
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Expected complete instance string based on the first instance from
// device_grouped_conv_fwd_xdl_f16_comp_instances. This corresponds to the configuration with
// BlockSize=256, MPerBlock=128, NPerBlock=128, KPerBlock=64, etc. Each ",<value>"
// fragment below maps to one template argument of the device operation, in
// declaration order. Declared const: read-only reference value for the tests below.
const std::string expected_str = "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"
"<2" // NDimSpatial
",GNHWC" // ALayout
",GKYXC" // BLayout
",EmptyTuple" // DsLayout
",GNHWK" // ELayout
",fp16" // ADataType
",fp16" // BDataType
",fp32" // AccDataType
",fp16" // CShuffleDataType
",EmptyTuple" // DsDataType
",fp16" // EDataType
",PassThrough" // AElementwiseOperation
",PassThrough" // BElementwiseOperation
",PassThrough" // CDEElementwiseOperation
",Default" // ConvForwardSpecialization
",MNKPadding" // GemmSpec
",256" // BlockSize
",128" // MPerBlock
",128" // NPerBlock
",64" // KPerBlock
",8" // AK1
",8" // BK1
",32" // MPerXDL
",32" // NPerXDL
",2" // MXdlPerWave
",2" // NXdlPerWave
",Seq(8,32,1)" // ABlockTransferThreadClusterLengths
",Seq(1,0,2)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",2" // ABlockTransferSrcVectorDim
",8" // ABlockTransferSrcScalarPerVector
",8" // ABlockTransferDstScalarPerVector_AK1
",0" // ABlockLdsExtraM
",Seq(8,32,1)" // BBlockTransferThreadClusterLengths
",Seq(1,0,2)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",2" // BBlockTransferSrcVectorDim
",8" // BBlockTransferSrcScalarPerVector
",8" // BBlockTransferDstScalarPerVector_BK1
",0" // BBlockLdsExtraN
",1" // CShuffleMXdlPerWavePerShuffle
",1" // CShuffleNXdlPerWavePerShuffle
",Seq(1,32,1,8)" // CDEBlockTransferClusterLengths
",8" // CDEBlockTransferScalarPerVector_NPerBlock
",Intrawave" // BlkGemmPipeSched
",v4" // BlkGemmPipelineVer
",fp16" // AComputeDataType
",fp16" // BComputeDataType
",false>"; // DirectLoad
// Exercise GetInstanceString() for the V3 variant. The call is made through a
// BaseOperator pointer to cover the polymorphic (virtual) path.
TEST(InstanceString, GetInstanceStringReturnsCorrectValueForFwdGrpConvV3)
{
    DeviceInstance instance;
    ck::tensor_operation::device::BaseOperator* base = &instance;
    EXPECT_EQ(base->GetInstanceString(), expected_str);
}
// describe() must report the same instance string as the device operation itself.
TEST(InstanceString, DescriptionReturnsCorrectValueForFwdGrpConvV3)
{
    const auto description = ckr::describe<DeviceInstance>();
    EXPECT_EQ(description.instance_string(), expected_str);
}
} // namespace

View File

@@ -0,0 +1,93 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include <ck_tile/builder/reflect/instance_traits.hpp>
#include <ck_tile/builder/reflect/conv_description.hpp>
#include <ck/tensor_operation/gpu/device/device_base.hpp>
#include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp>
namespace {
namespace ckr = ck_tile::reflect;
// Use the template helper to get a working instance configuration
using InstanceTuple =
ck::tensor_operation::device::instance::device_grouped_conv_fwd_wmma_f16_instances<
2, // NDimSpatial
ck::tensor_operation::device::instance::GNHWC, // ALayout
ck::tensor_operation::device::instance::GKYXC, // BLayout
ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout
ck::tensor_operation::device::instance::GNHWK, // ELayout
ck::Tuple<>, // DsDatatype
ck::tensor_operation::element_wise::PassThrough, // CDEElementOp
ck::tensor_operation::device::instance::ConvFwdDefault>; // ConvSpec
// Get the first instance from the tuple
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
// Expected complete instance string based on the first instance from
// device_grouped_conv_fwd_wmma_f16_instances. Each ",<value>" fragment below maps
// to one template argument of the WMMA device operation, in declaration order.
// Declared const: this is a read-only reference value shared by the tests below.
const std::string expected_str = "DeviceGroupedConvFwdMultipleD_Wmma_CShuffle"
"<2" // NDimSpatial
",GNHWC" // ALayout
",GKYXC" // BLayout
",EmptyTuple" // DsLayout
",GNHWK" // ELayout
",fp16" // ADataType
",fp16" // BDataType
",fp32" // AccDataType
",fp16" // CShuffleDataType
",EmptyTuple" // DsDataType
",fp16" // EDataType
",PassThrough" // AElementwiseOperation
",PassThrough" // BElementwiseOperation
",PassThrough" // CDEElementwiseOperation
",Default" // ConvForwardSpecialization
",MNKPadding" // GemmSpec
",1" // NumGemmKPrefetchStage
",128" // BlockSize
",64" // MPerBlock
",64" // NPerBlock
",32" // KPerBlock
",8" // K1
",16" // MPerWmma
",16" // NPerWmma
",2" // MRepeat
",2" // NRepeat
",Seq(4,32,1)" // ABlockTransferThreadClusterLengths
",Seq(1,0,2)" // ABlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
",2" // ABlockTransferSrcVectorDim
",1" // ABlockTransferSrcScalarPerVector
",8" // ABlockTransferDstScalarPerVector_AK1
",true" // ABlockLdsExtraM
",Seq(4,32,1)" // BBlockTransferThreadClusterLengths
",Seq(1,0,2)" // BBlockTransferThreadClusterArrangeOrder
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
",2" // BBlockTransferSrcVectorDim
",1" // BBlockTransferSrcScalarPerVector
",8" // BBlockTransferDstScalarPerVector_BK1
",true" // BBlockLdsExtraN
",1" // CShuffleMRepeatPerShuffle
",1" // CShuffleNRepeatPerShuffle
",Seq(1,32,1,4)" // CDEShuffleBlockTransferClusterLengths
",1" // CDEShuffleBlockTransferScalarPerVector_NPerBlock
",Default" // LoopSched
",v1>"; // PipelineVer
// Exercise GetInstanceString() for the WMMA variant. The call is made through a
// BaseOperator pointer to cover the polymorphic (virtual) path.
TEST(InstanceString, GetInstanceStringReturnsCorrectValueForFwdGrpConvWmma)
{
    DeviceInstance instance;
    ck::tensor_operation::device::BaseOperator* base = &instance;
    EXPECT_EQ(base->GetInstanceString(), expected_str);
}
// TODO: Add a DescriptionReturnsCorrectValueForFwdGrpConvWmma test once ckr::describe supports WMMA
// variants. The WMMA variant uses WMMA-specific parameters (MPerWmma, NPerWmma, MRepeat, NRepeat)
// and has a different tile transfer structure that is incompatible with the current ConvTraits.
} // namespace