From 050654984729ddc9fb972f134fe3b09257d1cbbf Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Tue, 28 Oct 2025 18:27:42 +0100 Subject: [PATCH] [CK_BUILDER] Factory tests (#3071) This pull request adds some initial "factory tests" - these check that the instances which are used in MIOpen are actually present in CK. The main reason for this is documentation and sanity checking. It's likely that these tests get outdated fast, so we'll have to maintain them, but fortunately this is quite straightforward and shouldn't take a lot of time once they are in place. [ROCm/composable_kernel commit: 6f58d6e4577b210625ad71a488864bcc1903fd93] --- experimental/builder/test/CMakeLists.txt | 25 +- ...ck_factory_grouped_convolution_forward.cpp | 2345 +++++++++++++++++ ...d_convolution_forward_bias_bnorm_clamp.cpp | 1272 +++++++++ ...grouped_convolution_forward_bias_clamp.cpp | 1269 +++++++++ ...y_grouped_convolution_forward_bilinear.cpp | 118 + ...tory_grouped_convolution_forward_clamp.cpp | 1249 +++++++++ ..._grouped_convolution_forward_convscale.cpp | 246 ++ ...grouped_convolution_forward_dynamic_op.cpp | 187 ++ ...tory_grouped_convolution_forward_scale.cpp | 115 + ...rouped_convolution_forward_scaleadd_ab.cpp | 104 + ...olution_forward_scaleadd_scaleadd_relu.cpp | 105 + .../builder/test/test_testing_utils.cpp | 98 + experimental/builder/test/testing_utils.cpp | 101 +- experimental/builder/test/testing_utils.hpp | 69 + .../gpu/grouped_convolution_forward.hpp | 2 +- ...grouped_convolution_forward_dynamic_op.hpp | 4 +- .../gpu/grouped_convolution_forward_scale.hpp | 2 +- script/remove_exec_bit.sh | 2 +- 18 files changed, 7295 insertions(+), 18 deletions(-) create mode 100644 experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp create mode 100644 experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp.cpp create mode 100644 experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp create mode 100644 
experimental/builder/test/test_ck_factory_grouped_convolution_forward_bilinear.cpp create mode 100644 experimental/builder/test/test_ck_factory_grouped_convolution_forward_clamp.cpp create mode 100644 experimental/builder/test/test_ck_factory_grouped_convolution_forward_convscale.cpp create mode 100644 experimental/builder/test/test_ck_factory_grouped_convolution_forward_dynamic_op.cpp create mode 100644 experimental/builder/test/test_ck_factory_grouped_convolution_forward_scale.cpp create mode 100644 experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp create mode 100644 experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp create mode 100644 experimental/builder/test/test_testing_utils.cpp diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index 1dc508a0c3..bafa95862a 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -2,7 +2,7 @@ include(gtest) # Helper function to create a gtest executable with common properties function(add_ck_builder_test test_name) - add_executable(${test_name} ${ARGN}) + add_executable(${test_name} ${ARGN} testing_utils.cpp) target_compile_features(${test_name} PRIVATE cxx_std_20) target_include_directories(${test_name} PRIVATE "${PROJECT_SOURCE_DIR}/experimental/builder/include" @@ -20,9 +20,28 @@ endfunction() add_ck_builder_test(test_conv_builder test_conv_builder.cpp test_instance_traits.cpp - test_instance_traits_util.cpp - testing_utils.cpp) + test_instance_traits_util.cpp) # Testing the virtual GetInstanceString methods requires kernel compilation. 
add_ck_builder_test(test_get_instance_string test_get_instance_string.cpp) + +add_ck_builder_test(test_inline_diff test_inline_diff.cpp) + +function(add_ck_factory_test test_name) + add_ck_builder_test(${test_name} ${ARGN}) + target_link_libraries(${test_name} PRIVATE composablekernels::device_conv_operations) +endfunction() + +add_ck_factory_test(test_testing_utils test_testing_utils.cpp) +add_ck_factory_test(test_ck_factory_grouped_convolution_forward test_ck_factory_grouped_convolution_forward.cpp) +add_ck_factory_test(test_ck_factory_grouped_convolution_forward_clamp test_ck_factory_grouped_convolution_forward_clamp.cpp) +add_ck_factory_test(test_ck_factory_grouped_convolution_forward_convscale test_ck_factory_grouped_convolution_forward_convscale.cpp) +add_ck_factory_test(test_ck_factory_grouped_convolution_forward_bilinear test_ck_factory_grouped_convolution_forward_bilinear.cpp) +add_ck_factory_test(test_ck_factory_grouped_convolution_forward_scale test_ck_factory_grouped_convolution_forward_scale.cpp) +add_ck_factory_test(test_ck_factory_grouped_convolution_forward_scaleadd_ab test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp) +add_ck_factory_test(test_ck_factory_grouped_convolution_forward_bias_clamp test_ck_factory_grouped_convolution_forward_bias_clamp.cpp) +add_ck_factory_test(test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp.cpp) +add_ck_factory_test(test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp) +add_ck_factory_test(test_ck_factory_grouped_convolution_forward_dynamic_op test_ck_factory_grouped_convolution_forward_dynamic_op.cpp) + diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp new file mode 100644 index 0000000000..53199f1d77 --- /dev/null +++ 
b/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp @@ -0,0 +1,2345 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "testing_utils.hpp" + +using ck_tile::test::InstanceSet; +using ck_tile::test::InstancesMatch; + +namespace { + +using namespace ck::tensor_layout::convolution; + +using ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; +using ck::tensor_operation::element_wise::PassThrough; + +template +using DeviceOp = DeviceGroupedConvFwdMultipleABD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + PassThrough, + PassThrough, + PassThrough, + AComputeType, + BComputeType>; + +} // namespace + +template +struct CkFactoryTestConvFwd : public testing::Test +{ + static auto get_actual_instances() + { + return InstanceSet::from_factory(); + } + + static auto get_expected_instances() { return InstanceSet(Case::expected); } +}; + +// 1D kernels - layout GNWC/GKXC/GNWK + +struct F32_1D_GNWC_GKXC_GNWK +{ + using DeviceOp = ::DeviceOp<1, GNWC, GKXC, GNWK, float>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F16_1D_GNWC_GKXC_GNWK +{ + using DeviceOp = ::DeviceOp<1, GNWC, GKXC, GNWK, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF16_1D_GNWC_GKXC_GNWK +{ + using DeviceOp = ::DeviceOp<1, GNWC, GKXC, GNWK, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct S8_1D_GNWC_GKXC_GNWK +{ + using DeviceOp = ::DeviceOp<1, GNWC, GKXC, GNWK, int8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +// 2D kernels - layout GNHWC/GKYXC/GNHWK + +struct F32_2D_GNHWC_GKYXC_GNHWK +{ + using DeviceOp = ::DeviceOp<2, GNHWC, GKYXC, GNHWK, float>; + + constexpr static auto expected = { + // clang-format off 
+ "" + // clang-format on + }; +}; + +struct F16_2D_GNHWC_GKYXC_GNHWK +{ + using DeviceOp = ::DeviceOp<2, GNHWC, GKYXC, GNHWK, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF16_2D_GNHWC_GKYXC_GNHWK +{ + using DeviceOp = ::DeviceOp<2, GNHWC, GKYXC, GNHWK, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct S8_2D_GNHWC_GKYXC_GNHWK +{ + using DeviceOp = ::DeviceOp<2, GNHWC, GKYXC, GNHWK, int8_t>; + + constexpr static std::initializer_list expected = { + // clang-format off + // clang-format on + }; +}; + +// 2D kernels - layout NHWGC/GKYXC/NHWGK + +struct F32_2D_NHWGC_GKYXC_NHWGK +{ + using DeviceOp = ::DeviceOp<2, NHWGC, GKYXC, NHWGK, float>; + + constexpr static auto expected = { + // clang-format off + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "" + // clang-format on + }; +}; + +struct F16_2D_NHWGC_GKYXC_NHWGK +{ + using DeviceOp = ::DeviceOp<2, NHWGC, GKYXC, NHWGK, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "" + // clang-format on + }; +}; + +struct BF16_2D_NHWGC_GKYXC_NHWGK +{ + using DeviceOp = ::DeviceOp<2, NHWGC, GKYXC, NHWGK, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "" + // clang-format on + }; +}; + +struct 
S8_2D_NHWGC_GKYXC_NHWGK +{ + using DeviceOp = ::DeviceOp<2, NHWGC, GKYXC, NHWGK, int8_t>; + + constexpr static auto expected = { + // clang-format off + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "" + // clang-format on + }; +}; + +// 2D kernels - layout NGCHW/GKCYX/NGKHW + +struct F32_2D_NGCHW_GKCYX_NGKHW +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKCYX, NGKHW, float>; + + constexpr static auto expected = { + // clang-format off + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "" + // clang-format on + }; 
+}; + +struct F16_2D_NGCHW_GKCYX_NGKHW +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKCYX, NGKHW, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "" + // clang-format on + }; 
+}; + +struct BF16_2DNGCHW_GKCYX_NGKHWK +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKCYX, NGKHW, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "" + // clang-format on + }; +}; + +// 2D kernels - layout NGCHW/GKYXC/NGKHW + +struct F32_2D_NGCHW_GKYXC_NGKHW +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKYXC, NGKHW, float>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F16_2D_NGCHW_GKYXC_NGKHW +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKYXC, NGKHW, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF16_2D_NGCHW_GKYXC_NGKHW +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKYXC, NGKHW, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct 
S8_2D_NGCHW_GKYXC_NGKHW +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKYXC, NGKHW, int8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +// 3D - layout GNDHWC/GKZYXC/GNDHWK + +struct F32_3D_GNDHWC_GKZYXC_GNDHWK +{ + using DeviceOp = ::DeviceOp<3, GNDHWC, GKZYXC, GNDHWK, float>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F16_3D_GNDHWC_GKZYXC_GNDHWK +{ + using DeviceOp = ::DeviceOp<3, GNDHWC, GKZYXC, GNDHWK, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF16_3D_GNDHWC_GKZYXC_GNDHWK +{ + using DeviceOp = ::DeviceOp<3, GNDHWC, GKZYXC, GNDHWK, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct S8_3D_GNDHWC_GKZYXC_GNDHWK +{ + using DeviceOp = ::DeviceOp<3, GNDHWC, GKZYXC, GNDHWK, int8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +// 3D - layout NDHWGC/GKZYXC/NDHWGK + +struct F32_3D_NDHWGC_GKZYXC_NDHWGK_TF32 +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, float, float, float, ck::tf32_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + // clang-format on + }; +}; + +struct F32_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, float>; + + constexpr static auto expected = { + // clang-format off + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "" + // clang-format on + }; +}; + +struct F16_3D_NDHWGC_GKZYXC_NDHWGK_F8 +{ + using DeviceOp = + ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::half_t, ck::half_t, ck::half_t, ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + 
+struct F8_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF8_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::bf8_t, ck::bf8_t, ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F8_BF8_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = + ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::f8_t, ck::bf8_t, ck::f8_t, ck::f8_t, ck::bf8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF8_F8_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = + ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::bf8_t, ck::f8_t, ck::f8_t, ck::bf8_t, ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F16_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + // clang-format on + }; +}; + +struct BF16_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + // clang-format on + }; +}; + +struct S8_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, int8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +// 3D - layout NGCDHW/GKCZYX/NGKDHW + +struct 
F32_3D_NGCDHW_GKCZYX_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NGCDHW, GKCZYX, NGKDHW, float>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + // clang-format on + }; +}; + +struct F16_3D_NGCDHW_GKCZYX_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NGCDHW, GKCZYX, NGKDHW, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + // clang-format on + }; +}; + +struct BF16_3D_NGCDHW_GKCZYX_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NGCDHW, GKCZYX, NGKDHW, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + // clang-format on + }; +}; + +using TestTypes = ::testing::Types; + +TYPED_TEST_SUITE(CkFactoryTestConvFwd, TestTypes); + +TYPED_TEST(CkFactoryTestConvFwd, TestInstances) +{ + auto actual = TestFixture::get_actual_instances(); + auto expected = 
TestFixture::get_expected_instances(); + + EXPECT_THAT(actual, InstancesMatch(expected)); +} diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp.cpp new file mode 100644 index 0000000000..c68fdb4b24 --- /dev/null +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp.cpp @@ -0,0 +1,1272 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "testing_utils.hpp" + +using ck_tile::test::InstanceSet; +using ck_tile::test::InstancesMatch; + +namespace { + +using InLayout = ck::tensor_layout::convolution::NHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::NHWGK; + +using ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; +using ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using ck::tensor_operation::element_wise::PassThrough; + +template +using Quintet = ck::Tuple; + +template +struct DeviceOpHelper; + +template +struct DeviceOpHelper<2, T, ComputeType> +{ + using InLayout = ck::tensor_layout::convolution::NHWGC; + using WeiLayout = ck::tensor_layout::convolution::GKYXC; + using OutLayout = ck::tensor_layout::convolution::NHWGK; + + using Type = DeviceGroupedConvFwdMultipleABD<2, + InLayout, + WeiLayout, + Quintet, // DsLayout + OutLayout, + T, // InDataType + T, // WeiDataType + Quintet, // DsDataType + T, // OutDataType + PassThrough, + PassThrough, + BiasNormalizeInInferClamp, + ComputeType>; +}; + +template +struct DeviceOpHelper<3, T, ComputeType> +{ + using InLayout = ck::tensor_layout::convolution::NDHWGC; + using WeiLayout = ck::tensor_layout::convolution::GKZYXC; + using OutLayout = ck::tensor_layout::convolution::NDHWGK; + + using Type = DeviceGroupedConvFwdMultipleABD<3, + InLayout, + WeiLayout, + Quintet, // 
DsLayout + OutLayout, + T, // InDataType + T, // WeiDataType + Quintet, // DsDataType + T, // OutDataType + PassThrough, + PassThrough, + BiasNormalizeInInferClamp, + ComputeType>; +}; + +template +using DeviceOp = DeviceOpHelper::Type; + +} // namespace + +template +struct CkFactoryTestConvFwd : public testing::Test +{ + static auto get_actual_instances() + { + return InstanceSet::from_factory(); + } + + static auto get_expected_instances() { return InstanceSet(Case::expected); } +}; + +struct F32_2D +{ + using DeviceOp = ::DeviceOp<2, float>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + // clang-format on + }; +}; + +struct F32_TF32_2D +{ + using DeviceOp = ::DeviceOp<2, float, ck::tf32_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + // clang-format on + }; +}; + +struct F16_2D +{ + using DeviceOp = ::DeviceOp<2, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + // clang-format on + }; +}; + +struct BF16_2D +{ + using DeviceOp = ::DeviceOp<2, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + // clang-format on + }; +}; + +struct F32_3D +{ + using DeviceOp = ::DeviceOp<3, float>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + // clang-format on + }; +}; + +struct F32_TF32_3D +{ + using DeviceOp = ::DeviceOp<3, float, ck::tf32_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + // clang-format on + }; +}; + +struct F16_3D +{ + using DeviceOp = ::DeviceOp<3, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + // clang-format on + }; +}; + +struct BF16_3D +{ + using DeviceOp = ::DeviceOp<3, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + // clang-format on + }; +}; + +using TestTypes = + ::testing::Types; + +TYPED_TEST_SUITE(CkFactoryTestConvFwd, TestTypes); + +TYPED_TEST(CkFactoryTestConvFwd, TestInstances) +{ + auto actual = TestFixture::get_actual_instances(); + auto expected = TestFixture::get_expected_instances(); + + EXPECT_THAT(actual, InstancesMatch(expected)); +} diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp new file mode 100644 index 0000000000..a9e5e39e8f --- /dev/null +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp @@ -0,0 +1,1269 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "testing_utils.hpp" + +using ck_tile::test::InstanceSet; +using ck_tile::test::InstancesMatch; + +namespace { + +using InLayout = ck::tensor_layout::convolution::NHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::NHWGK; + +using ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; +using ck::tensor_operation::element_wise::AddClamp; +using ck::tensor_operation::element_wise::PassThrough; + +template +struct DeviceOpHelper; + +template +struct DeviceOpHelper<2, T, ComputeType> +{ + using InLayout = ck::tensor_layout::convolution::NHWGC; + using WeiLayout = ck::tensor_layout::convolution::GKYXC; + using OutLayout = ck::tensor_layout::convolution::NHWGK; + + using Type = DeviceGroupedConvFwdMultipleABD<2, + InLayout, + WeiLayout, + ck::Tuple, // DsLayout + OutLayout, + T, // InDataType + T, // WeiDataType + ck::Tuple, // DsDataType + T, // OutDataType + PassThrough, + PassThrough, + AddClamp, + ComputeType>; +}; + +template +struct DeviceOpHelper<3, T, ComputeType> +{ + using InLayout = ck::tensor_layout::convolution::NDHWGC; + using WeiLayout = ck::tensor_layout::convolution::GKZYXC; + using OutLayout = ck::tensor_layout::convolution::NDHWGK; + + using Type = DeviceGroupedConvFwdMultipleABD<3, + InLayout, + WeiLayout, + ck::Tuple, // DsLayout + OutLayout, + T, // InDataType + T, // WeiDataType + ck::Tuple, // DsDataType + T, // OutDataType + PassThrough, + PassThrough, + AddClamp, + ComputeType>; +}; + +template +using DeviceOp = DeviceOpHelper::Type; + +} // namespace + +template +struct CkFactoryTestConvFwd : public testing::Test +{ + static auto get_actual_instances() + { + return InstanceSet::from_factory(); + } + + static auto get_expected_instances() { return InstanceSet(Case::expected); } +}; + +struct F32_2D +{ + using DeviceOp = ::DeviceOp<2, float>; + + constexpr static auto expected = { + // clang-format off + "", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + // clang-format on + }; +}; + +struct F32_TF32_2D +{ + using DeviceOp = ::DeviceOp<2, float, ck::tf32_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + // clang-format on + }; +}; + +struct F16_2D +{ + using DeviceOp = ::DeviceOp<2, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + // clang-format on + }; +}; + +struct BF16_2D +{ + using DeviceOp = ::DeviceOp<2, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + // clang-format on + }; +}; + +struct F32_3D +{ + using DeviceOp = ::DeviceOp<3, float>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + // clang-format on + }; +}; + +struct F32_TF32_3D +{ + using DeviceOp = ::DeviceOp<3, float, ck::tf32_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + // clang-format on + }; +}; + +struct F16_3D +{ + using DeviceOp = ::DeviceOp<3, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + // clang-format on + }; +}; + +struct BF16_3D +{ + using DeviceOp = ::DeviceOp<3, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + // clang-format on + }; +}; + +using TestTypes = + ::testing::Types; + +TYPED_TEST_SUITE(CkFactoryTestConvFwd, TestTypes); + +TYPED_TEST(CkFactoryTestConvFwd, TestInstances) +{ + auto actual = TestFixture::get_actual_instances(); + auto expected = TestFixture::get_expected_instances(); + + EXPECT_THAT(actual, InstancesMatch(expected)); +} diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bilinear.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bilinear.cpp new file mode 100644 index 0000000000..cc9cbd3dda --- /dev/null +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bilinear.cpp @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include "ck/utility/data_type.hpp" +#include "testing_utils.hpp" + +using ck_tile::test::InstanceSet; +using ck_tile::test::InstancesMatch; + +namespace { + +constexpr static auto NumDimSpatial = 3; +using InLayout = ck::tensor_layout::convolution::NDHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::NDHWGK; +using DsLayout = ck::Tuple; + +using ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; +using ck::tensor_operation::element_wise::Bilinear; +using ck::tensor_operation::element_wise::PassThrough; + +template +using DeviceOp = DeviceGroupedConvFwdMultipleABD, + type, // OutDataType + PassThrough, + PassThrough, + Bilinear, + computeType, + computeType>; + +} // namespace + +template +struct CkFactoryTestBilinearFwd : public testing::Test +{ + static auto get_actual_instances() + { + return InstanceSet::from_factory(); + } + + static auto get_expected_instances() { return InstanceSet(Case::expected); } +}; + +struct Bilinear_F32 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct Bilinear_F32_TF32 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct Bilinear_F16 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct Bilinear_BF16 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct Bilinear_INT8 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +using TestTypes = + ::testing::Types; + +TYPED_TEST_SUITE(CkFactoryTestBilinearFwd, TestTypes); + +TYPED_TEST(CkFactoryTestBilinearFwd, TestInstances) +{ + auto actual = TestFixture::get_actual_instances(); + 
auto expected = TestFixture::get_expected_instances(); + + EXPECT_THAT(actual, InstancesMatch(expected)); +} diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_clamp.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_clamp.cpp new file mode 100644 index 0000000000..a8d1d1763f --- /dev/null +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_clamp.cpp @@ -0,0 +1,1249 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "ck/utility/data_type.hpp" +#include "testing_utils.hpp" + +using ck_tile::test::InstanceSet; +using ck_tile::test::InstancesMatch; + +namespace { + +using InLayout = ck::tensor_layout::convolution::NDHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::NDHWGK; +using DsLayout = ck::Tuple<>; + +using ck::tensor_layout::convolution::GKYXC; +using ck::tensor_layout::convolution::GKZYXC; +using ck::tensor_layout::convolution::NDHWGC; +using ck::tensor_layout::convolution::NDHWGK; +using ck::tensor_layout::convolution::NHWGC; +using ck::tensor_layout::convolution::NHWGK; + +using DsLayout = ck::Tuple<>; + +using ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; +using ck::tensor_operation::element_wise::Clamp; +using ck::tensor_operation::element_wise::PassThrough; + +template +using DeviceOp = DeviceGroupedConvFwdMultipleABD, // DsDataType + T, // OutDataType + PassThrough, + PassThrough, + Clamp, + ComputeType>; + +} // namespace + +template +struct CkFactoryTestBilinearFwd : public testing::Test +{ + static auto get_actual_instances() + { + return InstanceSet::from_factory(); + } + + static auto get_expected_instances() { return InstanceSet(Case::expected); } +}; + +struct F32_2D +{ + using DeviceOp = ::DeviceOp<2, float, NHWGC, GKYXC, NHWGK>; + + constexpr static auto expected = { + // clang-format off + "", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + // clang-format on + }; +}; + +struct F32_3D +{ + using DeviceOp = ::DeviceOp<3, float, NDHWGC, GKZYXC, NDHWGK>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + // clang-format on + }; +}; + +struct F32_TF32_2D +{ + using DeviceOp = ::DeviceOp<2, float, NHWGC, GKYXC, NHWGK, ck::tf32_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + // clang-format on + }; +}; + +struct F32_TF32_3D +{ + using DeviceOp = ::DeviceOp<3, float, NDHWGC, GKZYXC, NDHWGK, ck::tf32_t>; + + constexpr static auto expected = { + // clang-format off + "", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + // clang-format on + }; +}; + +struct F16_2D +{ + using DeviceOp = ::DeviceOp<2, ck::half_t, NHWGC, GKYXC, NHWGK>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + // clang-format on + }; +}; + +struct F16_3D +{ + using DeviceOp = ::DeviceOp<3, ck::half_t, NDHWGC, GKZYXC, NDHWGK>; + + constexpr static auto expected = { + // clang-format off + "", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + // clang-format on + }; +}; + +struct BF16_2D +{ + using DeviceOp = ::DeviceOp<2, ck::bhalf_t, NHWGC, GKYXC, NHWGK>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + // clang-format on + }; +}; + +struct BF16_3D +{ + using DeviceOp = ::DeviceOp<3, ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", 
+ "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + // clang-format on + }; +}; + +using TestTypes = + ::testing::Types; + +TYPED_TEST_SUITE(CkFactoryTestBilinearFwd, TestTypes); + +TYPED_TEST(CkFactoryTestBilinearFwd, TestInstances) +{ + auto actual = TestFixture::get_actual_instances(); + auto expected = TestFixture::get_expected_instances(); + + EXPECT_THAT(actual, InstancesMatch(expected)); +} diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_convscale.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_convscale.cpp new file mode 100644 index 0000000000..0da300e3d8 --- /dev/null +++ 
b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_convscale.cpp @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include "testing_utils.hpp" + +using ck_tile::test::InstanceSet; +using ck_tile::test::InstancesMatch; + +namespace { + +constexpr static auto NumDimSpatial = 3; +using InLayout = ck::tensor_layout::convolution::NDHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::NDHWGK; + +using ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; +using ck::tensor_operation::device::instance::CombConvScale; +using ck::tensor_operation::device::instance::CombConvScaleRelu; +using ck::tensor_operation::element_wise::ConvInvscale; +using ck::tensor_operation::element_wise::ConvScale; +using ck::tensor_operation::element_wise::ConvScaleAdd; +using ck::tensor_operation::element_wise::ConvScaleRelu; +using ck::tensor_operation::element_wise::PassThrough; + +template +using DeviceOp = DeviceGroupedConvFwdMultipleABD; + +} // namespace + +template +struct CkFactoryTestConvFwd : public testing::Test +{ + static auto get_actual_instances() + { + return InstanceSet::from_factory(); + } + + static auto get_expected_instances() { return InstanceSet(Case::expected); } +}; + +struct F8_ConvScale +{ + using DeviceOp = ::DeviceOp, + ck::Tuple<>, + ck::f8_t, + ck::f8_t, + ck::f8_t, + ConvScale, + ck::f8_t, + ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F8_BF8_comb1_ConvScale +{ + using DeviceOp = ::DeviceOp, + ck::Tuple<>, + ck::bf8_t, + ck::bf8_t, + ck::f8_t, + ConvScale, + ck::bf8_t, + ck::bf8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F8_BF8_comb2_ConvScale +{ + using DeviceOp = ::DeviceOp, + ck::Tuple<>, + ck::f8_t, + 
ck::bf8_t, + ck::f8_t, + ConvScale, + ck::f8_t, + ck::bf8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F8_BF8_comb3_ConvScale +{ + using DeviceOp = ::DeviceOp, + ck::Tuple<>, + ck::bf8_t, + ck::f8_t, + ck::f8_t, + ConvScale, + ck::bf8_t, + ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F8_float_CombConvScale +{ + using DeviceOp = ::DeviceOp, + ck::Tuple<>, + ck::f8_t, + ck::f8_t, + float, + CombConvScale, + ck::f8_t, + ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F8_ConvScaleRelu +{ + using DeviceOp = ::DeviceOp, + ck::Tuple<>, + ck::f8_t, + ck::f8_t, + ck::f8_t, + ConvScaleRelu, + ck::f8_t, + ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F8_CombConvScaleRelu +{ + using DeviceOp = ::DeviceOp, + ck::Tuple<>, + ck::f8_t, + ck::f8_t, + float, + CombConvScaleRelu, + ck::f8_t, + ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F8_ConvScaleAdd +{ + using DeviceOp = ::DeviceOp, + ck::Tuple, + ck::f8_t, + ck::f8_t, + ck::f8_t, + ConvScaleAdd, + ck::f8_t, + ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F8_ConvInvscale +{ + using DeviceOp = ::DeviceOp, + ck::Tuple<>, + ck::f8_t, + ck::f8_t, + ck::f8_t, + ConvInvscale, + ck::f8_t, + ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +using TestTypes = ::testing::Types; + +TYPED_TEST_SUITE(CkFactoryTestConvFwd, TestTypes); + +TYPED_TEST(CkFactoryTestConvFwd, TestInstances) +{ + auto actual = TestFixture::get_actual_instances(); + auto expected = TestFixture::get_expected_instances(); + + EXPECT_THAT(actual, InstancesMatch(expected)); +} diff --git 
a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_dynamic_op.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_dynamic_op.cpp new file mode 100644 index 0000000000..e918785fd7 --- /dev/null +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_dynamic_op.cpp @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "ck/utility/data_type.hpp" +#include "testing_utils.hpp" + +using ck_tile::test::InstanceSet; +using ck_tile::test::InstancesMatch; + +namespace { + +using InLayout = ck::tensor_layout::convolution::NDHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::NDHWGK; + +using ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; +using ck::tensor_operation::element_wise::DynamicUnaryOp; +using ck::tensor_operation::element_wise::PassThrough; + +template +struct DeviceOpHelper; + +template +struct DeviceOpHelper<2, T> +{ + using InLayout = ck::tensor_layout::convolution::NHWGC; + using WeiLayout = ck::tensor_layout::convolution::GKYXC; + using OutLayout = ck::tensor_layout::convolution::NHWGK; + + using Type = DeviceGroupedConvFwdMultipleABD<2, + InLayout, + WeiLayout, + ck::Tuple<>, // DsLayout + OutLayout, + T, // InDataType + T, // WeiDataType + ck::Tuple<>, // DsDataType + T, // OutDataType + PassThrough, + PassThrough, + DynamicUnaryOp>; +}; + +template +struct DeviceOpHelper<3, T> +{ + using InLayout = ck::tensor_layout::convolution::NDHWGC; + using WeiLayout = ck::tensor_layout::convolution::GKZYXC; + using OutLayout = ck::tensor_layout::convolution::NDHWGK; + + using Type = DeviceGroupedConvFwdMultipleABD<3, + InLayout, + WeiLayout, + ck::Tuple<>, // DsLayout + OutLayout, + T, // InDataType + T, // WeiDataType + ck::Tuple<>, // DsDataType + T, // OutDataType + PassThrough, + PassThrough, + DynamicUnaryOp>; +}; + +template 
+using DeviceOp = DeviceOpHelper::Type; + +} // namespace + +template +struct CkFactoryTestBilinearFwd : public testing::Test +{ + static auto get_actual_instances() + { + return InstanceSet::from_factory(); + } + + static auto get_expected_instances() { return InstanceSet(Case::expected); } +}; + +struct DyOp_F32_2 +{ + using DeviceOp = ::DeviceOp<2, float>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct DyOp_F32_3 +{ + using DeviceOp = ::DeviceOp<3, float>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct DyOp_F16_2 +{ + using DeviceOp = ::DeviceOp<2, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct DyOp_F16_3 +{ + using DeviceOp = ::DeviceOp<3, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct DyOp_BF16_2 +{ + using DeviceOp = ::DeviceOp<2, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct DyOp_BF16_3 +{ + using DeviceOp = ::DeviceOp<3, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct DyOp_INT8_2 +{ + using DeviceOp = ::DeviceOp<2, int8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct DyOp_INT8_3 +{ + using DeviceOp = ::DeviceOp<3, int8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +using TestTypes = ::testing::Types; + +TYPED_TEST_SUITE(CkFactoryTestBilinearFwd, TestTypes); + +TYPED_TEST(CkFactoryTestBilinearFwd, TestInstances) +{ + auto actual = TestFixture::get_actual_instances(); + auto expected = TestFixture::get_expected_instances(); + + EXPECT_THAT(actual, InstancesMatch(expected)); +} diff --git 
a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scale.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scale.cpp new file mode 100644 index 0000000000..428d1c81f3 --- /dev/null +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scale.cpp @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "testing_utils.hpp" + +using ck_tile::test::InstanceSet; +using ck_tile::test::InstancesMatch; + +namespace { + +constexpr static auto NumDimSpatial = 3; +using InLayout = ck::tensor_layout::convolution::NDHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::NDHWGK; + +using ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; +using ck::tensor_operation::element_wise::PassThrough; +using ck::tensor_operation::element_wise::Scale; + +template +using DeviceOp = DeviceGroupedConvFwdMultipleABD, // DsLayout + OutLayout, + T, // InDataType + T, // WeiDataType + ck::Tuple<>, // DsDataType + T, // OutDataType + PassThrough, + PassThrough, + Scale, + ComputeType>; + +} // namespace + +template +struct CkFactoryTestConvFwd : public testing::Test +{ + static auto get_actual_instances() + { + return InstanceSet::from_factory(); + } + + static auto get_expected_instances() { return InstanceSet(Case::expected); } +}; + +struct F32 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F32_TF32 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F16 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF16 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format 
off + "" + // clang-format on + }; +}; + +struct S8 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +using TestTypes = ::testing::Types; + +TYPED_TEST_SUITE(CkFactoryTestConvFwd, TestTypes); + +TYPED_TEST(CkFactoryTestConvFwd, TestInstances) +{ + auto actual = TestFixture::get_actual_instances(); + auto expected = TestFixture::get_expected_instances(); + + EXPECT_THAT(actual, InstancesMatch(expected)); +} diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp new file mode 100644 index 0000000000..774c30c05e --- /dev/null +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "testing_utils.hpp" + +using ck_tile::test::InstanceSet; +using ck_tile::test::InstancesMatch; + +namespace { + +constexpr static auto NumDimSpatial = 3; +using InLayout = ck::tensor_layout::convolution::NDHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::NDHWGK; + +using ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; +using ck::tensor_operation::element_wise::PassThrough; +using ck::tensor_operation::element_wise::ScaleAdd; + +template +using DeviceOp = DeviceGroupedConvFwdMultipleABD, // DsLayout + OutLayout, + ck::Tuple, // InDataType + ck::Tuple, // WeiDataType + ck::Tuple<>, // DsDataType + T, // OutDataType + ScaleAdd, + ScaleAdd, + PassThrough, + T>; // ComputeType + +} // namespace + +template +struct CkFactoryTestConvFwd : public testing::Test +{ + static auto get_actual_instances() + { + return InstanceSet::from_factory(); + } + + static auto get_expected_instances() { return InstanceSet(Case::expected); } 
+}; + +struct F32 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F16 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF16 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct S8 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +using TestTypes = ::testing::Types; + +TYPED_TEST_SUITE(CkFactoryTestConvFwd, TestTypes); + +TYPED_TEST(CkFactoryTestConvFwd, TestInstances) +{ + auto actual = TestFixture::get_actual_instances(); + auto expected = TestFixture::get_expected_instances(); + + EXPECT_THAT(actual, InstancesMatch(expected)); +} diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp new file mode 100644 index 0000000000..ba8726a643 --- /dev/null +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "testing_utils.hpp" + +using ck_tile::test::InstanceSet; +using ck_tile::test::InstancesMatch; + +namespace { + +constexpr static auto NumDimSpatial = 3; +using InLayout = ck::tensor_layout::convolution::NDHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::NDHWGK; + +using ck::tensor_layout::convolution::G_K; +using ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; +using ck::tensor_operation::element_wise::PassThrough; +using ck::tensor_operation::element_wise::ScaleAddScaleAddRelu; + +template +using DeviceOp = DeviceGroupedConvFwdMultipleABD, // DsLayout + OutLayout, + T, // InDataType + T, // WeiDataType + ck::Tuple, // DsDataType + T, // OutDataType + PassThrough, + PassThrough, + ScaleAddScaleAddRelu, + T>; // ComputeType + +} // namespace + +template +struct CkFactoryTestConvFwd : public testing::Test +{ + static auto get_actual_instances() + { + return InstanceSet::from_factory(); + } + + static auto get_expected_instances() { return InstanceSet(Case::expected); } +}; + +struct F32 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F16 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF16 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct S8 +{ + using DeviceOp = ::DeviceOp; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +using TestTypes = ::testing::Types; + +TYPED_TEST_SUITE(CkFactoryTestConvFwd, TestTypes); + +TYPED_TEST(CkFactoryTestConvFwd, TestInstances) +{ + auto actual = TestFixture::get_actual_instances(); + auto expected = TestFixture::get_expected_instances(); + + EXPECT_THAT(actual, InstancesMatch(expected)); +} diff --git 
a/experimental/builder/test/test_testing_utils.cpp b/experimental/builder/test/test_testing_utils.cpp new file mode 100644 index 0000000000..24a1c9bc81 --- /dev/null +++ b/experimental/builder/test/test_testing_utils.cpp @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "testing_utils.hpp" + +using ck_tile::test::InstanceMatcher; +using ck_tile::test::InstanceSet; +using ck_tile::test::StringEqWithDiff; + +TEST(InstanceSet, FromFactory) +{ + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD< + 2, // NDimSpatial + ck::tensor_operation::device::instance::NHWGC, // InLayout + ck::tensor_operation::device::instance::GKYXC, // WeiLayout + ck::tensor_operation::device::instance::Empty_Tuple, // DsLayout + ck::tensor_operation::device::instance::NHWGK, // OutLayout + ck::half_t, // ADataType + ck::half_t, // BDataType + ck::Tuple<>, // DsDataType + ck::half_t, // EDataType + ck::tensor_operation::element_wise::PassThrough, // AElementwiseOperation + ck::tensor_operation::element_wise::PassThrough, // BElementwiseOperation + ck::tensor_operation::element_wise::PassThrough, // CDEElementwiseOperation + ck::half_t, // AComputeType + ck::half_t>; // BComputeType + + const auto instances = InstanceSet::from_factory(); + + EXPECT_THAT(instances.instances.size(), testing::Gt(0)); + + const auto* el = + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16," + "fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64," + "8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2," + "8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>"; + EXPECT_THAT(instances.instances, testing::Contains(el)); +} + +TEST(InstanceMatcher, Basic) +{ + auto a = InstanceSet{ + "python", + "cobra", + "boa", + }; + + auto b = InstanceSet{ + "cobra", + "boa", + "python", + }; + + 
auto c = InstanceSet{ + "adder", + "boa", + "cobra", + }; + + auto d = InstanceSet{ + "boa", + "python", + }; + + EXPECT_THAT(a, InstancesMatch(b)); + EXPECT_THAT(c, Not(InstancesMatch(b))); + EXPECT_THAT(a, Not(InstancesMatch(d))); + EXPECT_THAT(d, Not(InstancesMatch(b))); +} + +TEST(InstanceMatcher, ExplainMatchResult) +{ + auto actual = InstanceSet{ + "python", + "cobra", + "boa", + }; + + auto expected = InstanceSet{ + "adder", + "boa", + "cobra", + "rattlesnake", + }; + + testing::StringMatchResultListener listener; + EXPECT_TRUE(!ExplainMatchResult(InstancesMatch(expected), actual, &listener)); + + EXPECT_THAT(listener.str(), + StringEqWithDiff("\n" + " Missing: 2\n" + "- adder\n" + "- rattlesnake\n" + "Unexpected: 1\n" + "- python\n")); +} diff --git a/experimental/builder/test/testing_utils.cpp b/experimental/builder/test/testing_utils.cpp index c99d56ef56..34793b601e 100644 --- a/experimental/builder/test/testing_utils.cpp +++ b/experimental/builder/test/testing_utils.cpp @@ -1,21 +1,18 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. -#include -#include -#include -#include -#include +#include "testing_utils.hpp" #include #include -#include "testing_utils.hpp" +#include +#include +#include +#include +#include +#include namespace ck_tile::test { -namespace { - -} // namespace - // Wagner-Fischer Algorithm for Computing Edit Distance and Inline Diff // // OUTPUT FORMAT: [expected|actual] for differences, plain text for matches @@ -216,4 +213,88 @@ void StringEqWithDiffMatcher::DescribeNegationTo(std::ostream* os) const return ::testing::MakeMatcher(new StringEqWithDiffMatcher(expected)); } +std::ostream& operator<<(std::ostream& os, const InstanceSet& set) +{ + // These sets can grow very large, and so it's not very nice or useful to print them + // in the event of a mismatch. Just print a brief description here, and use + // InstanceMatcher to print a more useful message.
+ return (os << "(set of " << set.instances.size() << " instances)"); +} + +InstanceMatcher::InstanceMatcher(const InstanceSet& expected) : expected_(expected) {} + +::testing::Matcher InstancesMatch(const InstanceSet& expected) +{ + return ::testing::MakeMatcher(new InstanceMatcher(expected)); +} + +bool InstanceMatcher::MatchAndExplain(InstanceSet actual, + ::testing::MatchResultListener* listener) const +{ + if(actual.instances == expected_.instances) + { + return true; + } + + if(listener->IsInterested()) + { + std::vector instances; + std::set_difference(expected_.instances.begin(), + expected_.instances.end(), + actual.instances.begin(), + actual.instances.end(), + std::back_inserter(instances)); + + *listener << "\n"; + + if(instances.size() > 0) + { + *listener << " Missing: " << instances.size() << "\n"; + for(const auto& instance : instances) + { + if(instance == "") + { + *listener << "- (empty string)\n"; + } + else + { + *listener << "- " << instance << "\n"; + } + } + } + + instances.clear(); + std::set_difference(actual.instances.begin(), + actual.instances.end(), + expected_.instances.begin(), + expected_.instances.end(), + std::back_inserter(instances)); + + if(instances.size() > 0) + { + *listener << "Unexpected: " << instances.size() << "\n"; + for(const auto& instance : instances) + { + if(instance == "") + { + *listener << "- (empty string)\n"; + } + else + { + *listener << "- " << instance << "\n"; + } + } + } + } + + return false; +} + +void InstanceMatcher::DescribeTo(std::ostream* os) const { *os << expected_; } + +void InstanceMatcher::DescribeNegationTo(std::ostream* os) const +{ + *os << "is not equal to " << expected_; +} + } // namespace ck_tile::test diff --git a/experimental/builder/test/testing_utils.hpp b/experimental/builder/test/testing_utils.hpp index 3e8772a080..3ff2eb32de 100644 --- a/experimental/builder/test/testing_utils.hpp +++ b/experimental/builder/test/testing_utils.hpp @@ -1,10 +1,15 @@ // SPDX-License-Identifier: MIT 
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +#include #include #include #include #include +#include +#include +#include +#include namespace ck_tile::test { @@ -40,4 +45,68 @@ class StringEqWithDiffMatcher : public ::testing::MatcherInterface // Factory function for the StringEqWithDiff matcher ::testing::Matcher StringEqWithDiff(const std::string& expected); +using ck::tensor_operation::device::instance::DeviceOperationInstanceFactory; + +// This utility concept checks whether a type is a valid "Device Operation" - +// that is, there is a valid specialization of `DeviceOperationInstanceFactory` +// for it available. +template +concept HasCkFactory = requires { + { + DeviceOperationInstanceFactory::GetInstances() + } -> std::convertible_to>>; +}; + +// This structure represents a (unique) set of instances, either a statically +// defined one (for testing) or one obtained from DeviceOperationInstanceFactory. +// The idea is that we use this structure as a utility to compare a set of +// instances. Instances are stored in a set so that they can be lexicographically +// compared; this helps generate readable error messages which just contain +// the differences between sets. +struct InstanceSet +{ + explicit InstanceSet() {} + + explicit InstanceSet(std::initializer_list items) + : instances(items.begin(), items.end()) + { + } + + template + static InstanceSet from_factory() + { + auto set = InstanceSet(); + + const auto ops = DeviceOperationInstanceFactory::GetInstances(); + for(const auto& op : ops) + { + set.instances.insert(op->GetInstanceString()); + } + + return set; + } + + std::set instances; +}; + +std::ostream& operator<<(std::ostream& os, const InstanceSet& set); + +// This is a custom Google Test matcher which can be used to compare two sets +// of instance names, with utility functions that print a helpful error +// message about the difference between the checked sets.
Use `InstancesMatch` +// to obtain an instance of this type. +struct InstanceMatcher : public ::testing::MatcherInterface +{ + explicit InstanceMatcher(const InstanceSet& expected); + + bool MatchAndExplain(InstanceSet actual, + ::testing::MatchResultListener* listener) const override; + void DescribeTo(std::ostream* os) const override; + void DescribeNegationTo(std::ostream* os) const override; + + InstanceSet expected_; +}; + +::testing::Matcher InstancesMatch(const InstanceSet& expected); + } // namespace ck_tile::test diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp index ea871f3325..b086a8fbab 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp @@ -300,7 +300,7 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp index 1c873863b3..588bb554ad 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp @@ -7,7 +7,7 @@ #include #include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dynamic.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" @@ -161,7 +161,7 @@ template + typename BComputeType> struct DeviceOperationInstanceFactory>>& instances); -void 
add_device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instances( +void add_device_grouped_conv3d_fwd_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_tf32_instances( std::vector