diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm.hpp index 6558350a1d..55c39765aa 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm.hpp @@ -2,6 +2,7 @@ #include #include +#include namespace ck_tile::builder { @@ -31,7 +32,6 @@ struct ThreadBlock // Size of the submatrix problem in a thread block. MNK sub_matrix; }; - static_assert(ThreadBlockInfo); // Concept to check if struct provides thread block info. @@ -58,7 +58,6 @@ struct ConvTuningParams int m_xdl_per_wave = 0; int n_xdl_per_wave = 0; }; - static_assert(ConvTuningInfo); // Concept to check if a struct provides convolution tuning info. @@ -67,6 +66,77 @@ concept HasConvTuningInfo = requires { { T::tuning_params } -> ConvTuningInfo; }; +// Concept for A block transfer thread cluster lengths. +template +concept BlockATransferLengths = requires(T t) { + { t.k0 } -> std::convertible_to; + { t.m } -> std::convertible_to; + { t.k1 } -> std::convertible_to; +}; + +// Describe A block transfer thread cluster lengths. +struct BlockATransferLengthsInfo +{ + int k0; + int m; + int k1; +}; +static_assert(BlockATransferLengths); + +// Concept for B block transfer thread cluster lengths. +template +concept BlockBTransferLengths = requires(T t) { + { t.k0 } -> std::convertible_to; + { t.n } -> std::convertible_to; + { t.k1 } -> std::convertible_to; +}; + +// Describe B block transfer thread cluster lengths. +struct BlockBTransferLengthsInfo +{ + int k0; + int n; + int k1; +}; +static_assert(BlockBTransferLengths); + +// Concept for C block transfer thread cluster lengths. 
+template +concept BlockCTransferLengths = requires(T t) { + { t.m_block } -> std::convertible_to; + { t.m_wave_per_xdl } -> std::convertible_to; + { t.n_block } -> std::convertible_to; + { t.n_wave_per_xdl } -> std::convertible_to; +}; + +// Describe C block transfer thread cluster lengths. +struct BlockCTransferLengthsInfo +{ + int m_block; + int m_wave_per_xdl; + int n_block; + int n_wave_per_xdl; +}; +static_assert(BlockCTransferLengths<BlockCTransferLengthsInfo>); + +// Concept to check if a struct provides A Block transfer info. +template +concept HasABlockTransferInfo = requires { + { T::block_transfer.thread_cluster_lengths_a } -> BlockATransferLengths; +}; + +// Concept to check if a struct provides B Block transfer info. +template +concept HasBBlockTransferInfo = requires { + { T::block_transfer.thread_cluster_lengths_b } -> BlockBTransferLengths; +}; + +// Concept to check if a struct provides C Block transfer info. +template +concept HasCBlockTransferInfo = requires { + { T::block_transfer.thread_cluster_lengths_c } -> BlockCTransferLengths; +}; + // No requirements yet for a ConvAlogorithm concept. template concept ConvAlgorithm = std::is_class_v; diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 6ffc05ca99..85c0681912 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -129,7 +129,7 @@ constexpr ConvTuning SetConvTuningInfo() }; } -// Block tranfser paramters for A or B tensor. +// Block transfer parameters for A or B tensor. struct BlockTransfer { ck::Array thread_cluster_lengths = {0, 0, 0}; // k0, m, k1 @@ -144,12 +144,79 @@ struct BlockTransfer // Block transfer parameters for C tensor. 
struct CBlockTransfer { - int m_xdl_per_wave_per_shuffle = 0; - int n_xdl_per_wave_per_shuffle = 0; - ck::Array cluster_lengths = {0, 0, 0, 0}; - int scaler_per_vector = 8; + int m_xdl_per_wave_per_shuffle = 0; + int n_xdl_per_wave_per_shuffle = 0; + ck::Array thread_cluster_lengths = {0, 0, 0, 0}; + int scaler_per_vector = 8; }; +template +constexpr BlockTransfer SetABlockTransfer() +{ + BlockTransfer block_transfer{ + .thread_cluster_lengths = {4, 64, 1}, + .thread_cluster_order = {1, 0, 2}, + .src_access_order = {1, 0, 2}, + .src_vector_dim = 2, + .src_scaler_per_vector = 8, + .dest_scaler_per_vector_k1 = 8, + .add_extra = 0, + }; + using AlgorithmType = decltype(ALGORITHM); + if constexpr(HasABlockTransferInfo) + { + constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_lengths_a; + block_transfer.thread_cluster_lengths = {TCL.k0, TCL.m, TCL.k1}; + } + // Default. + return block_transfer; +} + +template +constexpr BlockTransfer SetBBlockTransfer() +{ + BlockTransfer block_transfer{ + .thread_cluster_lengths = {4, 64, 1}, + .thread_cluster_order = {1, 0, 2}, + .src_access_order = {1, 0, 2}, + .src_vector_dim = 2, + .src_scaler_per_vector = 8, + .dest_scaler_per_vector_k1 = 8, + .add_extra = 0, + }; + using AlgorithmType = decltype(ALGORITHM); + if constexpr(HasBBlockTransferInfo) + { + constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_lengths_b; + block_transfer.thread_cluster_lengths = {TCL.k0, TCL.n, TCL.k1}; + } + // Default. 
+ return block_transfer; +} + +template +constexpr CBlockTransfer SetCBlockTransfer() +{ + CBlockTransfer block_transfer{ + .m_xdl_per_wave_per_shuffle = 1, + .n_xdl_per_wave_per_shuffle = 1, + .thread_cluster_lengths = {1, 32, 1, 8}, + .scaler_per_vector = 8, + }; + using AlgorithmType = decltype(ALGORITHM); + if constexpr(HasCBlockTransferInfo) + { + constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_lengths_c; + block_transfer.thread_cluster_lengths = { + TCL.m_block, + TCL.m_wave_per_xdl, + TCL.n_block, + TCL.n_wave_per_xdl, + }; + } + return block_transfer; +} + // Factory builds an instance of a grouped convolution kernel. template requires SupportedVersion @@ -163,34 +230,13 @@ struct GroupedConvForwardXldCShuffleFactoryV3 .conv_spec = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default, .gemm_spec = ck::tensor_operation::device::GemmSpecialization::MNKPadding, }; - static constexpr ConvBlock BLOCK = SetThreadBlockInfo(); - static constexpr ConvTuning TUNING = SetConvTuningInfo(); - static constexpr BlockTransfer A_BLOCK_TRANSFER{ - .thread_cluster_lengths = {4, 64, 1}, - .thread_cluster_order = {1, 0, 2}, - .src_access_order = {1, 0, 2}, - .src_vector_dim = 2, - .src_scaler_per_vector = 8, - .dest_scaler_per_vector_k1 = 8, - .add_extra = 0, - }; - static constexpr BlockTransfer B_BLOCK_TRANSFER{ - .thread_cluster_lengths = {4, 64, 1}, - .thread_cluster_order = {1, 0, 2}, - .src_access_order = {1, 0, 2}, - .src_vector_dim = 2, - .src_scaler_per_vector = 8, - .dest_scaler_per_vector_k1 = 8, - .add_extra = 0, - }; - static constexpr CBlockTransfer C_BLOCK_TRANSFER{ - .m_xdl_per_wave_per_shuffle = 1, - .n_xdl_per_wave_per_shuffle = 1, - .cluster_lengths = {1, 32, 1, 8}, - .scaler_per_vector = 8, - }; - static constexpr auto PIPELINE_SCHEDULER = ck::BlockGemmPipelineScheduler::Intrawave; - static constexpr auto PIPELINE_VERSION = ck::BlockGemmPipelineVersion::v4; + static constexpr ConvBlock BLOCK = SetThreadBlockInfo(); + static 
constexpr ConvTuning TUNING = SetConvTuningInfo(); + static constexpr BlockTransfer A_BLOCK_TRANSFER = SetABlockTransfer(); + static constexpr BlockTransfer B_BLOCK_TRANSFER = SetBBlockTransfer(); + static constexpr CBlockTransfer C_BLOCK_TRANSFER = SetCBlockTransfer(); + static constexpr auto PIPELINE_SCHEDULER = ck::BlockGemmPipelineScheduler::Intrawave; + static constexpr auto PIPELINE_VERSION = ck::BlockGemmPipelineVersion::v4; // The convlution kernel class instance. using Instance = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3< // @@ -236,7 +282,7 @@ struct GroupedConvForwardXldCShuffleFactoryV3 B_BLOCK_TRANSFER.add_extra, C_BLOCK_TRANSFER.m_xdl_per_wave_per_shuffle, C_BLOCK_TRANSFER.n_xdl_per_wave_per_shuffle, - ToSequence, + ToSequence, C_BLOCK_TRANSFER.scaler_per_vector, PIPELINE_SCHEDULER, PIPELINE_VERSION>; diff --git a/experimental/builder/test/test_conv_builder.cpp b/experimental/builder/test/test_conv_builder.cpp index 0b512de289..e416f58fa5 100644 --- a/experimental/builder/test/test_conv_builder.cpp +++ b/experimental/builder/test/test_conv_builder.cpp @@ -36,16 +36,31 @@ struct FwdConvAlgorithm { ckb::ThreadBlock thread_block; ckb::ConvTuningParams tuning_params; + struct BlockTransfer + { + ckb::BlockATransferLengthsInfo thread_cluster_lengths_a; + ckb::BlockBTransferLengthsInfo thread_cluster_lengths_b; + ckb::BlockCTransferLengthsInfo thread_cluster_lengths_c; + } block_transfer; }; static_assert(ckb::ConvAlgorithm); static_assert(ckb::HasThreadBlockInfo); static_assert(ckb::HasConvTuningInfo); +static_assert(ckb::HasABlockTransferInfo); +static_assert(ckb::HasBBlockTransferInfo); +static_assert(ckb::HasCBlockTransferInfo); TEST(ConvBuilderTest, TestConvFwdXdlBf16CompInstances2xInstance0) { static constexpr FwdConvAlgorithm algorithm{ .thread_block{.block_size = 256, .sub_matrix = {.m = 256, .n = 128, .k = 64}}, - .tuning_params{.ak1 = 16, .bk1 = 16, .m_xdl_per_wave = 2, .n_xdl_per_wave = 2}}; + 
.tuning_params{.ak1 = 16, .bk1 = 16, .m_xdl_per_wave = 2, .n_xdl_per_wave = 2}, + .block_transfer{ + .thread_cluster_lengths_a = {.k0 = 4, .m = 64, .k1 = 1}, + .thread_cluster_lengths_b = {.k0 = 4, .n = 64, .k1 = 1}, + .thread_cluster_lengths_c = + {.m_block = 1, .m_wave_per_xdl = 32, .n_block = 1, .n_wave_per_xdl = 8}, + }}; using Builder = ckb::ConvBuilder; EXPECT_EQ( Builder::Instance::TypeString(), @@ -53,12 +68,28 @@ TEST(ConvBuilderTest, TestConvFwdXdlBf16CompInstances2xInstance0) "8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4>"); EXPECT_EQ(Builder::factory::TUNING.ak1, 16); EXPECT_EQ(Builder::factory::TUNING.bk1, 16); + EXPECT_EQ(Builder::factory::A_BLOCK_TRANSFER.thread_cluster_lengths[0], 4); + EXPECT_EQ(Builder::factory::A_BLOCK_TRANSFER.thread_cluster_lengths[1], 64); + EXPECT_EQ(Builder::factory::A_BLOCK_TRANSFER.thread_cluster_lengths[2], 1); + EXPECT_EQ(Builder::factory::B_BLOCK_TRANSFER.thread_cluster_lengths[0], 4); + EXPECT_EQ(Builder::factory::B_BLOCK_TRANSFER.thread_cluster_lengths[1], 64); + EXPECT_EQ(Builder::factory::B_BLOCK_TRANSFER.thread_cluster_lengths[2], 1); + EXPECT_EQ(Builder::factory::C_BLOCK_TRANSFER.thread_cluster_lengths[0], 1); + EXPECT_EQ(Builder::factory::C_BLOCK_TRANSFER.thread_cluster_lengths[1], 32); + EXPECT_EQ(Builder::factory::C_BLOCK_TRANSFER.thread_cluster_lengths[2], 1); + EXPECT_EQ(Builder::factory::C_BLOCK_TRANSFER.thread_cluster_lengths[3], 8); } TEST(ConvBuilderTest, GroupedConvFwdXdlBf16CompInstance0) { static constexpr FwdConvAlgorithm algorithm{ .thread_block{.block_size = 256, .sub_matrix = {.m = 256, .n = 256, .k = 32}}, - .tuning_params{.ak1 = 8, .bk1 = 8, .m_xdl_per_wave = 4, .n_xdl_per_wave = 4}}; + .tuning_params{.ak1 = 8, .bk1 = 8, .m_xdl_per_wave = 4, .n_xdl_per_wave = 4}, + .block_transfer{ + .thread_cluster_lengths_a = {.k0 = 4, .m = 64, .k1 = 1}, + .thread_cluster_lengths_b = {.k0 = 4, .n = 64, .k1 = 1}, + .thread_cluster_lengths_c = + {.m_block = 1, .m_wave_per_xdl = 
32, .n_block = 1, .n_wave_per_xdl = 8}, + }}; using Builder = ckb::ConvBuilder; EXPECT_EQ( Builder::Instance::TypeString(), @@ -66,6 +97,16 @@ TEST(ConvBuilderTest, GroupedConvFwdXdlBf16CompInstance0) "8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4>"); EXPECT_EQ(Builder::factory::TUNING.ak1, 8); EXPECT_EQ(Builder::factory::TUNING.bk1, 8); + EXPECT_EQ(Builder::factory::A_BLOCK_TRANSFER.thread_cluster_lengths[0], 4); + EXPECT_EQ(Builder::factory::A_BLOCK_TRANSFER.thread_cluster_lengths[1], 64); + EXPECT_EQ(Builder::factory::A_BLOCK_TRANSFER.thread_cluster_lengths[2], 1); + EXPECT_EQ(Builder::factory::B_BLOCK_TRANSFER.thread_cluster_lengths[0], 4); + EXPECT_EQ(Builder::factory::B_BLOCK_TRANSFER.thread_cluster_lengths[1], 64); + EXPECT_EQ(Builder::factory::B_BLOCK_TRANSFER.thread_cluster_lengths[2], 1); + EXPECT_EQ(Builder::factory::C_BLOCK_TRANSFER.thread_cluster_lengths[0], 1); + EXPECT_EQ(Builder::factory::C_BLOCK_TRANSFER.thread_cluster_lengths[1], 32); + EXPECT_EQ(Builder::factory::C_BLOCK_TRANSFER.thread_cluster_lengths[2], 1); + EXPECT_EQ(Builder::factory::C_BLOCK_TRANSFER.thread_cluster_lengths[3], 8); }; } // namespace