|
|
|
|
@@ -1,252 +0,0 @@
|
|
|
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
|
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
#include <cstdlib>
|
|
|
|
|
#include <vector>
|
|
|
|
|
#include <memory>
|
|
|
|
|
|
|
|
|
|
namespace ck {
|
|
|
|
|
namespace tensor_operation {
|
|
|
|
|
namespace device {
|
|
|
|
|
namespace instance {
|
|
|
|
|
|
|
|
|
|
struct gemm_add_add_fastgelu_instances
|
|
|
|
|
{
|
|
|
|
|
// Table of 51 string literals, each spelling out one
// `DeviceGemmMultipleD_Xdl_CShuffle< ... >` template argument list.
//
// Every entry starts with the same leading arguments:
//   Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16,
//   PassThrough, PassThrough, AddAddFastGelu
// followed by a GEMM specialization (GemmDefault or GemmMNKPadding), a list of
// numeric / S<...> arguments that differ per entry, and finally a
// LoopScheduler / PipelineVersion pair.
//
// Layout of the table:
//   * 16 GemmDefault entries with LoopScheduler::Default,   PipelineVersion::v1
//   * the same 16 argument sets with LoopScheduler::Interwave, PipelineVersion::v1
//   * the same 16 argument sets with LoopScheduler::Default,   PipelineVersion::v2
//   * 3 GemmMNKPadding entries, one per scheduler/pipeline combination above
//
// NOTE(review): the `km_kn` part of the variable name presumably corresponds to
// the leading `Col, Row` layout arguments — confirm against the CK instance naming.
// NOTE(review): the per-entry numeric arguments are presumably tile/block tuning
// parameters — confirm against the DeviceGemmMultipleD_Xdl_CShuffle declaration.
// NOTE(review): std::string is used here, but only <cstdlib>, <vector> and
// <memory> are included at the top of this header — consider adding <string>.
static inline std::vector<std::string> device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance =
|
|
|
|
|
{
|
|
|
|
|
// --- GemmDefault, LoopScheduler::Default, PipelineVersion::v1 (16 entries) ---
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
// --- GemmDefault, LoopScheduler::Interwave, PipelineVersion::v1 (16 entries) ---
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
// --- GemmDefault, LoopScheduler::Default, PipelineVersion::v2 (16 entries) ---
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
// --- GemmMNKPadding: one entry per scheduler/pipeline combination used above ---
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v2>"
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static inline std::vector<std::string> device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance =
|
|
|
|
|
{
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v2>"
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Instance table for the `mk_kn_mn_mn_mn` variant: 51 string literals, each
// spelling out one `DeviceGemmMultipleD_Xdl_CShuffle<...>` template-argument list.
// Every entry starts with `Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32,
// F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu`.
// The entries are stored as text, not as instantiated types; how the strings
// are consumed is not visible in this header.
//
// Table layout, in order:
//   - 16 entries: GemmDefault,    LoopScheduler::Default,   PipelineVersion::v1
//   - 16 entries: GemmDefault,    LoopScheduler::Interwave, PipelineVersion::v1
//   - 16 entries: GemmDefault,    LoopScheduler::Default,   PipelineVersion::v2
//   -  3 entries: GemmMNKPadding, one for each scheduler/pipeline pair listed above
//
// NOTE(review): the `mk_kn` part of the name presumably encodes the layouts given
// by the leading `Row, Row` arguments -- confirm against the device-op definition.
// NOTE(review): the meaning of the individual numeric arguments is defined by the
// `DeviceGemmMultipleD_Xdl_CShuffle` template, which is not part of this file.
static inline std::vector<std::string> device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance =
|
|
|
|
|
{
|
|
|
|
|
// --- GemmDefault, LoopScheduler::Default, PipelineVersion::v1 (16 entries) ---
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
// --- GemmDefault, LoopScheduler::Interwave, PipelineVersion::v1 (16 entries) ---
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
// --- GemmDefault, LoopScheduler::Default, PipelineVersion::v2 (16 entries) ---
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
|
|
|
|
|
// --- GemmMNKPadding: one entry per scheduler/pipeline pair used above (3 entries) ---
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Interwave, PipelineVersion::v1>",
|
|
|
|
|
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v2>"
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Textual list of `DeviceGemmMultipleD_Xdl_CShuffle<...>` template-ids whose
// first two template arguments are `Row, Col`.
// NOTE(review): "mk_nk" presumably encodes the A/B operand layouts (A as MxK,
// B as NxK) -- confirm against the naming convention used by the consumers.
//
// Layout of the table (42 entries):
//   * 13 GemmDefault tile configurations, repeated three times with
//     (LoopScheduler::Default,   PipelineVersion::v1),
//     (LoopScheduler::Interwave, PipelineVersion::v1),
//     (LoopScheduler::Default,   PipelineVersion::v2);
//   * 3 GemmMNKPadding entries (64-thread, 16x16x32 tile), one per
//     scheduler/pipeline combination above.
// The strings are data: their exact spelling is preserved byte-for-byte.
static inline std::vector<std::string> device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance =
{
    // GemmDefault, LoopScheduler::Default, PipelineVersion::v1
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",

    // GemmDefault, LoopScheduler::Interwave, PipelineVersion::v1
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",

    // GemmDefault, LoopScheduler::Default, PipelineVersion::v2
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",

    // GemmMNKPadding, one entry per scheduler/pipeline combination
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Interwave, PipelineVersion::v1>",
    "DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v2>"
};
|
|
|
|
|
|
|
|
|
|
static auto get_col_row_instances()
|
|
|
|
|
{
|
|
|
|
|
return device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static auto get_col_col_instances()
|
|
|
|
|
{
|
|
|
|
|
return device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static auto get_row_row_instances()
|
|
|
|
|
{
|
|
|
|
|
return device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static auto get_row_col_instances()
|
|
|
|
|
{
|
|
|
|
|
return device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance;
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
} // namespace instance
|
|
|
|
|
} // namespace device
|
|
|
|
|
} // namespace tensor_operation
|
|
|
|
|
} // namespace ck
|