mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-27 00:14:35 +00:00
[CK_BUILDER] Instance traits for conv bwd weight algorithms (#3498)
Added instance traits for the following bwd weight conv algorithms
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
DeviceGroupedConvBwdWeight_Wmma_CShuffleV3
DeviceGroupedConvBwdWeight_Wmma_CShuffle
DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle
DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3
DeviceGroupedConvBwdWeight_DL
DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3
Also added unit tests for the instance traits of those bwd weight algorithms that are currently exposed by the narrow CK build for MIOpen.
---------
Co-authored-by: Ville Pietilä <>
[ROCm/composable_kernel commit: 6e8c401e33]
This commit is contained in:
@@ -119,13 +119,29 @@ add_ck_builder_test(test_ckb_conv_builder
|
||||
# - Group convolution (v3, standard, large tensor, WMMA, DL variants)
|
||||
# - Backward weight group convolution (XDL)
|
||||
# Requires kernel compilation to validate the generated strings through the base class.
|
||||
add_ck_builder_test(test_ckb_instance_string
|
||||
|
||||
set(INSTANCE_STRING_TESTS
|
||||
test_instance_string_fwd_grp_conv_v3.cpp
|
||||
test_instance_string_fwd_grp_conv.cpp
|
||||
test_instance_string_fwd_grp_conv_large_tensor.cpp
|
||||
test_instance_string_fwd_grp_conv_wmma.cpp
|
||||
test_instance_string_fwd_grp_conv_dl.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_xdl.cpp)
|
||||
test_instance_string_bwd_weight_grp_conv_xdl.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_dl.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_multiple_d_xdl.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_two_stage_xdl.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_xdl_v3.cpp
|
||||
)
|
||||
|
||||
if (CK_USE_WMMA)
|
||||
list(APPEND INSTANCE_STRING_TESTS
|
||||
test_instance_string_bwd_weight_grp_conv_wmma_v3.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_multiple_d_wmma_v3.cpp
|
||||
test_instance_string_bwd_weight_grp_conv_two_stage_wmma_v3.cpp
|
||||
)
|
||||
endif()
|
||||
|
||||
add_ck_builder_test(test_ckb_instance_string ${INSTANCE_STRING_TESTS})
|
||||
|
||||
# Tests the forward convolution builder across multiple data types and dimensions.
|
||||
# Individual tests are split into separate files to enable parallel compilation.
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
// Use the first instance from device_grouped_conv_bwd_weight_dl_f16_instances
|
||||
using InstanceTuple =
|
||||
ck::tensor_operation::device::instance::device_grouped_conv_bwd_weight_dl_f16_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
|
||||
ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
|
||||
ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default>;
|
||||
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Expected string based on the generic instance
|
||||
std::string expected_str = "DeviceGroupedConvBwdWeight_Dl"
|
||||
"<2" // NDimSpatial
|
||||
",GNHWC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",GNHWK" // OutLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",PassThrough" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",256" // BlockSize
|
||||
",128" // MPerBlock
|
||||
",128" // NPerBlock
|
||||
",16" // K0PerBlock
|
||||
",1" // K1
|
||||
",4" // M1PerThread
|
||||
",4" // N1PerThread
|
||||
",1" // KPerThread
|
||||
",Seq(8,2)" // M1N1ThreadClusterM1Xs
|
||||
",Seq(8,2)" // M1N1ThreadClusterN1Xs
|
||||
",Seq(1,8,1,1,1)" // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
|
||||
",Seq(1,2,1,128,1)" // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
|
||||
",Seq(0,2,3,1,4)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(0,2,3,1,4)" // ABlockTransferSrcAccessOrder
|
||||
",Seq(1,1,1,1,1)" // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
|
||||
",Seq(0,2,3,1,4)" // ABlockTransferSrcVectorTensorContiguousDimOrder
|
||||
",Seq(1,1,1,1,1)" // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
|
||||
",Seq(1,1,1,8,1)" // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
|
||||
",Seq(1,16,1,16,1)" // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
|
||||
",Seq(0,1,4,2,3)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(0,1,4,2,3)" // BBlockTransferSrcAccessOrder
|
||||
",Seq(1,1,1,1,1)" // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
|
||||
",Seq(0,1,4,2,3)" // BBlockTransferSrcVectorTensorContiguousDimOrder
|
||||
",Seq(1,1,1,1,1)" // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
|
||||
",Seq(0,1,2,3,4,5)" // CThreadTransferSrcDstAccessOrder
|
||||
",5" // CThreadTransferSrcDstVectorDim
|
||||
",1" // CThreadTransferDstScalarPerVector
|
||||
">";
|
||||
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvDl)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -0,0 +1,86 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_scale_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
// Use the first instance from device_grouped_conv_bwd_weight_wmma_c_shuffle_f16_scale_instances
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_wmma_c_shuffle_f16_scale_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
|
||||
ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
|
||||
ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default>;
|
||||
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Expected string based on the generic instance
|
||||
std::string expected_str =
|
||||
"DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3"
|
||||
"<2" // NDimSpatial
|
||||
",GNHWC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",GNHWK" // OutLayout
|
||||
",EmptyTuple" // DsLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",EmptyTuple" // DsDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",Scale" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",64" // BlockSize
|
||||
",64" // MPerBlock
|
||||
",64" // NPerBlock
|
||||
",32" // KPerBlock
|
||||
",8" // ABK1
|
||||
",16" // MPerWmma
|
||||
",16" // NPerWmma
|
||||
",4" // MRepeat
|
||||
",2" // NRepeat
|
||||
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
|
||||
",1" // ABlockTransferSrcVectorDim
|
||||
",2" // ABlockTransferSrcScalarPerVector
|
||||
",4" // ABlockTransferDstScalarPerVector_AK1
|
||||
",true" // ABlockLdsAddExtraM
|
||||
",Seq(4,8,1)" // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
|
||||
",1" // BBlockTransferSrcVectorDim
|
||||
",2" // BBlockTransferSrcScalarPerVector
|
||||
",4" // BBlockTransferDstScalarPerVector_BK1
|
||||
",true" // BBlockLdsAddExtraN
|
||||
",1" // CShuffleMRepeatPerShuffle
|
||||
",1" // CShuffleNRepeatPerShuffle
|
||||
",Seq(1,16,1,4)" // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
",2" // CShuffleBlockTransferScalarPerVector_NPerBlock
|
||||
",Intrawave" // BlkGemmPipeSched
|
||||
",v1" // BlkGemmPipelineVer
|
||||
",fp16" // ComputeTypeA
|
||||
",fp16" // ComputeTypeB
|
||||
">";
|
||||
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvMultipleDWmmaV3)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -0,0 +1,84 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_scale_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
// Use the first instance from device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_scale_instances
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_scale_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
|
||||
ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
|
||||
ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default>;
|
||||
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Expected string based on the generic instance
|
||||
std::string expected_str =
|
||||
"DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle"
|
||||
"<2" // NDimSpatial
|
||||
",GNHWC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",GNHWK" // OutLayout
|
||||
",EmptyTuple" // DsLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",EmptyTuple" // DsDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",Scale" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",64" // BlockSize
|
||||
",64" // MPerBlock
|
||||
",64" // NPerBlock
|
||||
",4" // K0PerBlock
|
||||
",8" // K1
|
||||
",32" // MPerXDL
|
||||
",32" // NPerXDL
|
||||
",2" // MXdlPerWave
|
||||
",2" // NXdlPerWave
|
||||
",Seq(1,4,8,2)" // ABlockTransferThreadClusterLengths_K0_M_K1
|
||||
",Seq(0,3,1,2)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(0,2,1,3)" // ABlockTransferSrcAccessOrder
|
||||
",2" // ABlockTransferSrcVectorDim
|
||||
",2" // ABlockTransferSrcScalarPerVector
|
||||
",4" // ABlockTransferDstScalarPerVector_K1
|
||||
",true" // ABlockLdsAddExtraM
|
||||
",Seq(1,4,8,2)" // BBlockTransferThreadClusterLengths_K0_N_K1
|
||||
",Seq(0,3,1,2)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(0,2,1,3)" // BBlockTransferSrcAccessOrder
|
||||
",2" // BBlockTransferSrcVectorDim
|
||||
",2" // BBlockTransferSrcScalarPerVector
|
||||
",4" // BBlockTransferDstScalarPerVector_K1
|
||||
",true" // BBlockLdsAddExtraN
|
||||
",1" // CShuffleMXdlPerWavePerShuffle
|
||||
",1" // CShuffleNXdlPerWavePerShuffle
|
||||
",Seq(1,16,1,4)" // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
",2" // CBlockTransferScalarPerVector_NWaveNPerXdl
|
||||
",fp16" // ComputeTypeA
|
||||
",fp16" // ComputeTypeB
|
||||
">";
|
||||
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvMultipleDXdl)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -0,0 +1,90 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
// Use the first instance from
|
||||
// device_grouped_conv_bwd_weight_two_stage_nhwgc_wmma_c_shuffle_f16_instances
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_two_stage_nhwgc_wmma_c_shuffle_f16_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
|
||||
ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
|
||||
ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default,
|
||||
ck::BlockGemmPipelineScheduler::Intrawave,
|
||||
ck::BlockGemmPipelineVersion::v1>;
|
||||
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Expected string based on the first instance (BlockSize=32, MPerBlock=16, NPerBlock=16, etc.)
|
||||
std::string expected_str =
|
||||
"DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3"
|
||||
"<2" // NDimSpatial
|
||||
",GNHWC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",GNHWK" // OutLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",PassThrough" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",32" // BlockSize
|
||||
",16" // MPerBlock
|
||||
",16" // NPerBlock
|
||||
",32" // KPerBlock
|
||||
",8" // ABK1
|
||||
",16" // MPerWmma
|
||||
",16" // NPerWmma
|
||||
",1" // MRepeat
|
||||
",1" // NRepeat
|
||||
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
|
||||
",1" // ABlockTransferSrcVectorDim
|
||||
",1" // ABlockTransferSrcScalarPerVector
|
||||
",4" // ABlockTransferDstScalarPerVector_AK1
|
||||
",false" // ABlockLdsAddExtraM
|
||||
",Seq(4,8,1)" // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
|
||||
",1" // BBlockTransferSrcVectorDim
|
||||
",1" // BBlockTransferSrcScalarPerVector
|
||||
",4" // BBlockTransferDstScalarPerVector_BK1
|
||||
",false" // BBlockLdsAddExtraN
|
||||
",1" // CShuffleMRepeatPerShuffle
|
||||
",1" // CShuffleNRepeatPerShuffle
|
||||
",Seq(1,4,1,8)" // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
",1" // CShuffleBlockTransferScalarPerVector_NPerBlock
|
||||
",Intrawave" // BlkGemmPipeSched
|
||||
",v1" // BlkGemmPipelineVer
|
||||
",1" // NumGroupsToMerge
|
||||
",fp16" // ComputeTypeA
|
||||
",fp16" // ComputeTypeB
|
||||
",1" // TransposeTransferSrcScalarPerVector
|
||||
",1" // TransposeTransferDstScalarPerVector
|
||||
">";
|
||||
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvTwoStageWmmaV3)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -0,0 +1,90 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
// Use the first instance from
|
||||
// device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_layout::convolution::GNHWC, // ALayout (InLayout)
|
||||
ck::tensor_layout::convolution::GKYXC, // BLayout (WeiLayout)
|
||||
ck::tensor_layout::convolution::GNHWK, // ELayout (OutLayout)
|
||||
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default,
|
||||
ck::BlockGemmPipelineScheduler::Intrawave,
|
||||
ck::BlockGemmPipelineVersion::v1>;
|
||||
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Expected string based on the first instance in the tuple
|
||||
std::string expected_str =
|
||||
"DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle"
|
||||
"<2" // NDimSpatial
|
||||
",GNHWC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",GNHWK" // OutLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",PassThrough" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",64" // BlockSize
|
||||
",16" // MPerBlock
|
||||
",16" // NPerBlock
|
||||
",32" // KPerBlock
|
||||
",8" // K1
|
||||
",16" // MPerXDL
|
||||
",16" // NPerXDL
|
||||
",1" // MXdlPerWave
|
||||
",1" // NXdlPerWave
|
||||
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_K0_M_K1
|
||||
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
|
||||
",1" // ABlockTransferSrcVectorDim
|
||||
",1" // ABlockTransferSrcScalarPerVector
|
||||
",4" // ABlockTransferDstScalarPerVector_K1
|
||||
",false" // ABlockLdsAddExtraM
|
||||
",Seq(4,8,1)" // BBlockTransferThreadClusterLengths_K0_N_K1
|
||||
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
|
||||
",1" // BBlockTransferSrcVectorDim
|
||||
",1" // BBlockTransferSrcScalarPerVector
|
||||
",4" // BBlockTransferDstScalarPerVector_K1
|
||||
",false" // BBlockLdsAddExtraN
|
||||
",1" // CShuffleMXdlPerWavePerShuffle
|
||||
",1" // CShuffleNXdlPerWavePerShuffle
|
||||
",Seq(1,8,1,8)" // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
",1" // CBlockTransferScalarPerVector_NWaveNPerXdl
|
||||
",Intrawave" // BlkGemmPipeSched
|
||||
",v1" // BlkGemmPipelineVer
|
||||
",1" // NumGroupsToMerge
|
||||
",fp16" // ComputeTypeA
|
||||
",fp16" // ComputeTypeB
|
||||
",1" // TransposeTransferSrcScalarPerVector
|
||||
",1" // TransposeTransferDstScalarPerVector
|
||||
">";
|
||||
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvTwoStageXdl)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -0,0 +1,90 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// #ifdef _NOT_DEFINED_
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_wmma_c_shuffle_bf16_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_operation::device::instance::NHWGC, // InLayout
|
||||
ck::tensor_operation::device::instance::GKYXC, // WeiLayout
|
||||
ck::tensor_operation::device::instance::NHWGK, // OutLayout
|
||||
ck::tensor_operation::device::instance::ConvBwdWeightDefault>;
|
||||
|
||||
// Expected complete instance string
|
||||
std::string expected_str = "DeviceGroupedConvBwdWeight_Wmma_CShuffle"
|
||||
"<2" // NDimSpatial
|
||||
",NHWGC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",NHWGK" // OutLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",PassThrough" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",64" // BlockSize
|
||||
",32" // MPerBlock
|
||||
",32" // NPerBlock
|
||||
",32" // KPerBlock
|
||||
",8" // ABK1
|
||||
",16" // MPerWmma
|
||||
",16" // NPerWmma
|
||||
",2" // MRepeat
|
||||
",1" // NRepeat
|
||||
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
|
||||
",1" // ABlockTransferSrcVectorDim
|
||||
",2" // ABlockTransferSrcScalarPerVector
|
||||
",2" // ABlockTransferDstScalarPerVector_AK1
|
||||
",false" // ABlockLdsAddExtraM
|
||||
",Seq(4,16,1)" // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
|
||||
",1" // BBlockTransferSrcVectorDim
|
||||
",2" // BBlockTransferSrcScalarPerVector
|
||||
",2" // BBlockTransferDstScalarPerVector_BK1
|
||||
",false" // BBlockLdsAddExtraN
|
||||
",1" // CShuffleMRepeatPerShuffle
|
||||
",1" // CShuffleNRepeatPerShuffle
|
||||
",Seq(1,8,1,8)" // CShuffleBlockTransferClusterLengths
|
||||
",2" // CShuffleBlockTransferScalarPerVector_NPerBlock
|
||||
",Intrawave" // BlkGemmPipeSched
|
||||
",v1" // BlkGemmPipelineVer
|
||||
",fp16" // ComputeTypeA
|
||||
",fp16" // ComputeTypeB
|
||||
",1" // MaxTransposeTransferSrcScalarPerVector
|
||||
",1" // MaxTransposeTransferDstScalarPerVector
|
||||
">";
|
||||
|
||||
// Get the first instance from the tuple
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Test describe() through base class pointer for WMMA V3 variant
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvWmmaV3)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// #endif
|
||||
@@ -0,0 +1,90 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// #ifdef _NOT_DEFINED_
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_wmma_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_v3_wmma_c_shuffle_f16_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_operation::device::instance::NHWGC, // InLayout
|
||||
ck::tensor_operation::device::instance::GKYXC, // WeiLayout
|
||||
ck::tensor_operation::device::instance::NHWGK, // OutLayout
|
||||
ck::tensor_operation::device::instance::ConvBwdWeightDefault>;
|
||||
|
||||
// Expected complete instance string
|
||||
std::string expected_str = "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3"
|
||||
"<2" // NDimSpatial
|
||||
",NHWGC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",NHWGK" // OutLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",PassThrough" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",64" // BlockSize
|
||||
",32" // MPerBlock
|
||||
",32" // NPerBlock
|
||||
",32" // KPerBlock
|
||||
",8" // ABK1
|
||||
",16" // MPerWmma
|
||||
",16" // NPerWmma
|
||||
",2" // MRepeat
|
||||
",1" // NRepeat
|
||||
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
|
||||
",1" // ABlockTransferSrcVectorDim
|
||||
",2" // ABlockTransferSrcScalarPerVector
|
||||
",2" // ABlockTransferDstScalarPerVector_AK1
|
||||
",false" // ABlockLdsAddExtraM
|
||||
",Seq(4,16,1)" // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
|
||||
",1" // BBlockTransferSrcVectorDim
|
||||
",2" // BBlockTransferSrcScalarPerVector
|
||||
",2" // BBlockTransferDstScalarPerVector_BK1
|
||||
",false" // BBlockLdsAddExtraN
|
||||
",1" // CShuffleMRepeatPerShuffle
|
||||
",1" // CShuffleNRepeatPerShuffle
|
||||
",Seq(1,8,1,8)" // CShuffleBlockTransferClusterLengths
|
||||
",2" // CShuffleBlockTransferScalarPerVector_NPerBlock
|
||||
",Intrawave" // BlkGemmPipeSched
|
||||
",v1" // BlkGemmPipelineVer
|
||||
",fp16" // ComputeTypeA
|
||||
",fp16" // ComputeTypeB
|
||||
",1" // MaxTransposeTransferSrcScalarPerVector
|
||||
",1" // MaxTransposeTransferDstScalarPerVector
|
||||
">";
|
||||
|
||||
// Get the first instance from the tuple
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Test describe() through base class pointer for WMMA V3 variant
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvWmmaV3)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// #endif
|
||||
@@ -0,0 +1,86 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ck_tile/builder/reflect/instance_traits.hpp"
|
||||
#include "ck_tile/builder/reflect/conv_description.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace ckr = ck_tile::reflect;
|
||||
|
||||
using InstanceTuple = ck::tensor_operation::device::instance::
|
||||
device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances<
|
||||
2, // NDimSpatial
|
||||
ck::tensor_operation::device::instance::GNHWC, // InLayout
|
||||
ck::tensor_operation::device::instance::GKYXC, // WeiLayout
|
||||
ck::tensor_operation::device::instance::GNHWK, // OutLayout
|
||||
ck::tensor_operation::device::instance::ConvBwdWeightDefault, // ConvBwdWeightSpecialization
|
||||
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
|
||||
ck::BlockGemmPipelineVersion::v1>; // BlkGemmPipelineVer
|
||||
|
||||
using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
|
||||
|
||||
// Expected complete instance string based on the generic instance
|
||||
std::string expected_str =
|
||||
"DeviceGroupedConvBwdWeight_Xdl_CShuffleV3"
|
||||
"<2" // NDimSpatial
|
||||
",GNHWC" // InLayout
|
||||
",GKYXC" // WeiLayout
|
||||
",GNHWK" // OutLayout
|
||||
",fp16" // InDataType
|
||||
",fp16" // WeiDataType
|
||||
",fp16" // OutDataType
|
||||
",fp32" // AccDataType
|
||||
",PassThrough" // InElementwiseOperation
|
||||
",PassThrough" // WeiElementwiseOperation
|
||||
",PassThrough" // OutElementwiseOperation
|
||||
",Default" // ConvBackwardWeightSpecialization
|
||||
",64" // BlockSize
|
||||
",32" // MPerBlock
|
||||
",32" // NPerBlock
|
||||
",32" // K0PerBlock
|
||||
",8" // K1
|
||||
",32" // MPerXDL
|
||||
",32" // NPerXDL
|
||||
",1" // MXdlPerWave
|
||||
",1" // NXdlPerWave
|
||||
",Seq(4,8,1)" // ABlockTransferThreadClusterLengths_K0_M_K1
|
||||
",Seq(2,0,1)" // ABlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // ABlockTransferSrcAccessOrder
|
||||
",1" // ABlockTransferSrcVectorDim
|
||||
",2" // ABlockTransferSrcScalarPerVector
|
||||
",2" // ABlockTransferDstScalarPerVector_K1
|
||||
",false" // ABlockLdsAddExtraM
|
||||
",Seq(4,16,1)" // BBlockTransferThreadClusterLengths_K0_N_K1
|
||||
",Seq(2,0,1)" // BBlockTransferThreadClusterArrangeOrder
|
||||
",Seq(1,0,2)" // BBlockTransferSrcAccessOrder
|
||||
",1" // BBlockTransferSrcVectorDim
|
||||
",2" // BBlockTransferSrcScalarPerVector
|
||||
",2" // BBlockTransferDstScalarPerVector_K1
|
||||
",false" // BBlockLdsAddExtraN
|
||||
",1" // CShuffleMXdlPerWavePerShuffle
|
||||
",1" // CShuffleNXdlPerWavePerShuffle
|
||||
",Seq(1,8,1,8)" // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
|
||||
",2" // CBlockTransferScalarPerVector_NWaveNPerXdl
|
||||
",Intrawave" // BlkGemmPipeSched
|
||||
",v1" // BlkGemmPipelineVer
|
||||
",fp16" // ComputeTypeA
|
||||
",fp16" // ComputeTypeB
|
||||
">";
|
||||
|
||||
// Test describe() through base class pointer for XDL V3 variant
|
||||
TEST(InstanceString, DescribeReturnsCorrectValueForBwdWeightGrpConvXdlV3)
|
||||
{
|
||||
using BaseClass = ck::tensor_operation::device::BaseOperator;
|
||||
DeviceInstance device_instance;
|
||||
BaseClass* base_ptr = &device_instance;
|
||||
|
||||
auto desc = base_ptr->describe();
|
||||
ASSERT_NE(desc, nullptr);
|
||||
EXPECT_EQ(desc->instance_string(), expected_str);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
Reference in New Issue
Block a user