diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp index cf8a203562..5946daf21e 100644 --- a/include/ck/tensor_operation/gpu/device/device_base.hpp +++ b/include/ck/tensor_operation/gpu/device/device_base.hpp @@ -37,42 +37,6 @@ struct BaseInvoker virtual ~BaseInvoker() {} }; -struct BaseParameters -{ - BaseParameters() = default; - BaseParameters(const BaseParameters&) = default; - BaseParameters& operator=(const BaseParameters&) = default; - - virtual void SetAElementOp(const std::string&) {} - - virtual void SetBElementOp(const std::string&) {} - - virtual void SetCDEElementOp(const std::string&) {} - - virtual void SetDsLayout(const std::string&) {} - - virtual void SetDsDataType(const std::string&) {} - - virtual void SetGemmSpec(const index_t, const index_t, const index_t) {} - - virtual index_t GetGridSize(const index_t, const index_t) - { - return 0; - } - - virtual index_t GetBlockSize() - { - return 0; - } - - virtual std::string GetParametersString() - { - return ""; - } - - virtual ~BaseParameters() {} -}; - struct BaseOperator { BaseOperator() = default; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp index f9db70c83a..9113bb7b74 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp @@ -51,26 +51,6 @@ struct DeviceGemmMultipleD : public BaseOperator CDEElementwiseOperation cde_element_op) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; - - virtual std::unique_ptr MakeParametersPointer() - { - return std::make_unique(BaseParameters{}); - } - - virtual index_t GetBlockSize() const - { - return 0; - } - - virtual index_t GetMPerBlock() const - { - return 0; - } - - virtual index_t GetNPerBlock() const - { - return 0; - } }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp index 779e865d32..74da68f88a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp @@ -699,195 +699,6 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD - static std::string GetSequenceString(S s) - { - auto str = std::stringstream(); - str << "ck::Sequence<"; - auto size = s.Size(); - for(int i = 0; i < size; ++i) - { - str << s.At(i); - if(i < size - 1) - str << ","; - } - str << ">"; - return str.str(); - } - - template - static std::string GetTypeString(T) - { - return ""; - } - - template <> - static std::string GetTypeString(float) - { - return "float"; - } - - template <> - static std::string GetTypeString(ck::half_t) - { - return "ck::half_t"; - } - - template <> - static std::string - GetTypeString(tensor_layout::gemm::RowMajor) - { - return "ck::tensor_layout::gemm::RowMajor"; - } - - template <> - static std::string - GetTypeString(tensor_layout::gemm::ColumnMajor) - { - return "ck::tensor_layout::gemm::ColumnMajor"; - } - - template - static std::string GetTupleString(T t) - { - auto str = std::stringstream(); - str << "ck::Tuple<"; - static_for<0, t.Size(), 1>{}([&](auto i) { - str << GetTypeString(t.At(i)); - if(i < t.Size() - 1) - str << ","; - }); - str << ">"; - return str.str(); - } - - template <> - static std::string GetTupleString>(Tuple<>) - { - return "ck::Tuple<>"; - } - - void SetAElementOp(const std::string& s) override { a_elementwise_op = s; } - - void SetBElementOp(const std::string& s) override { b_elementwise_op = s; } - - void SetCDEElementOp(const std::string& s) override { cde_elementwise_op = s; } - - void SetDsLayout(const std::string& s) override { ds_layout = s; } - - void SetDsDataType(const std::string& s) override { ds_data_type = s; } - - void SetGemmSpec(const index_t m, const index_t n, const index_t k) override - { - std::string spec = ""; - if(math::integer_divide_ceil(m, MPerBlock) * MPerBlock - m != 0) - spec += "M"; - if(math::integer_divide_ceil(n, NPerBlock) * NPerBlock - n != 0) - spec += "N"; - if(math::integer_divide_ceil(k, KPerBlock) * KPerBlock - k != 0) - spec += "K"; - if(spec == "") - gemm_spec = "ck::tensor_operation::device::GemmSpecialization::Default"; - else - gemm_spec = "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding"; - } - - index_t GetGridSize(const index_t m, const index_t n) override - { - return math::integer_divide_ceil(m, MPerBlock) * - math::integer_divide_ceil(n, NPerBlock); - } - - index_t GetBlockSize() override { return BlockSize; } - - std::string GetParametersString() override - { - auto str = std::stringstream(); - - std::map LoopSchedToString{ - {LoopScheduler::Default, "ck::LoopScheduler::Default"}, - {LoopScheduler::Interwave, "ck::LoopScheduler::Interwave"}}; - - std::map PipelineVersionToString{ - {PipelineVersion::v1, "ck::PipelineVersion::v1"}, - {PipelineVersion::v2, "ck::PipelineVersion::v2"}}; - - // clang-format off - str << "ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle" - << "<" - << GetTypeString(ALayout{}) << ", " - << GetTypeString(BLayout{}) << ", " - << ds_layout << ", " - << GetTypeString(ELayout{}) << ", " - << GetTypeString(ADataType{}) << ", " - << GetTypeString(BDataType{}) << ", " - << GetTypeString(AccDataType{}) << ", " - << GetTypeString(CShuffleDataType{}) << ", " - << ds_data_type << ", " - << GetTypeString(EDataType{}) << ", " - << a_elementwise_op << ", " - << b_elementwise_op << ", " - << cde_elementwise_op << ", " - << gemm_spec << ", " - << NumGemmKPrefetchStage << ", " - << BlockSize << ", " - << MPerBlock << ", " - << NPerBlock << ", " - << KPerBlock << ", " - << AK1 << ", " - << BK1 << ", " - << MPerXDL << ", " - << NPerXDL << ", " - << MXdlPerWave << ", " - << NXdlPerWave << ", " - << GetSequenceString(ABlockTransferThreadClusterLengths_AK0_M_AK1{}) << ", " - << GetSequenceString(ABlockTransferThreadClusterArrangeOrder{}) << ", " - << GetSequenceString(ABlockTransferSrcAccessOrder{}) << ", " - << ABlockTransferSrcVectorDim << ", " - << ABlockTransferSrcScalarPerVector << ", " - << ABlockTransferDstScalarPerVector_AK1 << ", " - << ABlockLdsExtraM << ", " - << GetSequenceString(BBlockTransferThreadClusterLengths_BK0_N_BK1{}) << ", " - << GetSequenceString(BBlockTransferThreadClusterArrangeOrder{}) << ", " - << GetSequenceString(BBlockTransferSrcAccessOrder{}) << ", " - << BBlockTransferSrcVectorDim << ", " - << BBlockTransferSrcScalarPerVector << ", " - << BBlockTransferDstScalarPerVector_BK1 << ", " - << BBlockLdsExtraN << ", " - << CShuffleMXdlPerWavePerShuffle << ", " - << CShuffleNXdlPerWavePerShuffle << ", " - << GetSequenceString(CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock{}) << ", " - << CDEBlockTransferScalarPerVector_NPerBlock << ", " - << LoopSchedToString[LoopSched] << ", " - << PipelineVersionToString[PipelineVer] - << ">"; - // clang-format on - - return str.str(); - } - - std::string a_elementwise_op = "ck::tensor_operation::element_wise::PassThrough"; - std::string b_elementwise_op = "ck::tensor_operation::element_wise::PassThrough"; - std::string cde_elementwise_op = "ck::tensor_operation::element_wise::PassThrough"; - std::string ds_layout = "ck::Tuple<>"; - std::string ds_data_type = "ck::Tuple<>"; - std::string gemm_spec = "ck::tensor_operation::device::GemmSpecialization::" + - getGemmSpecializationString(GemmSpec); - }; - - std::unique_ptr MakeParametersPointer() override - { - return std::make_unique(Parameters{}); - } - - index_t GetBlockSize() const override { return BlockSize; } - - index_t GetMPerBlock() const override { return MPerBlock; } - - index_t GetNPerBlock() const override { return NPerBlock; } - template struct Descriptor { diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp index cd4842dfa9..ceb2b665b9 100644 --- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp @@ -4,7 +4,7 @@ #pragma once #include "ck/utility/data_type.hpp" -#include "ck/utility/math.hpp" +#include "ck/utility/math_v2.hpp" #include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/quantization_operation.hpp" diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index e9a1dccbc7..c206c4dc04 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -21,6 +21,7 @@ ENDFOREACH() add_library(device_operations STATIC ${CK_DEVICE_INSTANCES}) add_library(composablekernels::device_operations ALIAS device_operations) + set(DEV_OPS_INC_DIRS ${PROJECT_SOURCE_DIR}/include/ck/ ${PROJECT_SOURCE_DIR}/library/include/ck/ @@ -55,7 +56,7 @@ target_compile_options(device_operations PRIVATE ) # install(TARGETS device_operations LIBRARY DESTINATION lib) -rocm_install(TARGETS device_operations +rocm_install(TARGETS device_operations EXPORT device_operationsTargets) rocm_install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck) diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/gemm_add_add_fastgelu_instances.hpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/gemm_add_add_fastgelu_instances.hpp deleted file mode 100644 index b425abbb92..0000000000 --- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/gemm_add_add_fastgelu_instances.hpp +++ /dev/null @@ -1,252 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include -#include -#include - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -struct gemm_add_add_fastgelu_instances -{ - static inline std::vector device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance = - { - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v2>" - }; - - static inline std::vector device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance = - { - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v2>" - }; - - static inline std::vector device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance = - { - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v2>" - }; - - static inline std::vector device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance = - { - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Interwave, PipelineVersion::v1>", - "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v2>" - }; - - static auto get_col_row_instances() - { - return device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance; - } - - static auto get_col_col_instances() - { - return device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance; - } - - static auto get_row_row_instances() - { - return device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance; - } - - static auto get_row_col_instances() - { - return device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance; - } -}; - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck