From d15334ed0d7a311abdf47c748e7d2ca789d71a18 Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Fri, 24 Oct 2025 14:21:54 +0200 Subject: [PATCH] ck-builder: ck factory grouped conv fwd --- experimental/builder/test/CMakeLists.txt | 2 + ...ck_factory_grouped_convolution_forward.cpp | 2345 +++++++++++++++++ experimental/builder/test/testing_utils.cpp | 12 +- .../gpu/grouped_convolution_forward.hpp | 2 +- 4 files changed, 2357 insertions(+), 4 deletions(-) create mode 100644 experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index 3c5580e693..d0703ede57 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -31,3 +31,5 @@ function(add_ck_factory_test test_name) endfunction() add_ck_factory_test(test_testing_utils test_testing_utils.cpp) +add_ck_factory_test(test_ck_factory_grouped_convolution_forward test_ck_factory_grouped_convolution_forward.cpp) +add_ck_factory_test(test_ck_factory_grouped_convolution_forward_convscale test_ck_factory_grouped_convolution_forward_convscale.cpp) diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp new file mode 100644 index 0000000000..53199f1d77 --- /dev/null +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp @@ -0,0 +1,2345 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "testing_utils.hpp" + +using ck_tile::test::InstanceSet; +using ck_tile::test::InstancesMatch; + +namespace { + +using namespace ck::tensor_layout::convolution; + +using ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; +using ck::tensor_operation::element_wise::PassThrough; + +template +using DeviceOp = DeviceGroupedConvFwdMultipleABD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + PassThrough, + PassThrough, + PassThrough, + AComputeType, + BComputeType>; + +} // namespace + +template +struct CkFactoryTestConvFwd : public testing::Test +{ + static auto get_actual_instances() + { + return InstanceSet::from_factory(); + } + + static auto get_expected_instances() { return InstanceSet(Case::expected); } +}; + +// 1D kernels - layout GNWC/GKXC/GNWK + +struct F32_1D_GNWC_GKXC_GNWK +{ + using DeviceOp = ::DeviceOp<1, GNWC, GKXC, GNWK, float>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F16_1D_GNWC_GKXC_GNWK +{ + using DeviceOp = ::DeviceOp<1, GNWC, GKXC, GNWK, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF16_1D_GNWC_GKXC_GNWK +{ + using DeviceOp = ::DeviceOp<1, GNWC, GKXC, GNWK, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct S8_1D_GNWC_GKXC_GNWK +{ + using DeviceOp = ::DeviceOp<1, GNWC, GKXC, GNWK, int8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +// 2D kernels - layout GNHWC/GKYXC/GNHWK + +struct F32_2D_GNHWC_GKYXC_GNHWK +{ + using DeviceOp = ::DeviceOp<2, GNHWC, GKYXC, GNHWK, float>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F16_2D_GNHWC_GKYXC_GNHWK +{ + using DeviceOp = ::DeviceOp<2, GNHWC, GKYXC, GNHWK, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF16_2D_GNHWC_GKYXC_GNHWK +{ + using DeviceOp = ::DeviceOp<2, GNHWC, GKYXC, GNHWK, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct S8_2D_GNHWC_GKYXC_GNHWK +{ + using DeviceOp = ::DeviceOp<2, GNHWC, GKYXC, GNHWK, int8_t>; + + constexpr static std::initializer_list expected = { + // clang-format off + // clang-format on + }; +}; + +// 2D kernels - layout NHWGC/GKYXC/NHWGK + +struct F32_2D_NHWGC_GKYXC_NHWGK +{ + using DeviceOp = ::DeviceOp<2, NHWGC, GKYXC, NHWGK, float>; + + constexpr static auto expected = { + // clang-format off + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "" + // clang-format on + }; +}; + +struct F16_2D_NHWGC_GKYXC_NHWGK +{ + using DeviceOp = ::DeviceOp<2, NHWGC, GKYXC, NHWGK, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "" + // clang-format on + }; +}; + +struct BF16_2D_NHWGC_GKYXC_NHWGK +{ + using DeviceOp = ::DeviceOp<2, NHWGC, GKYXC, NHWGK, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "" + // clang-format on + }; +}; + +struct S8_2D_NHWGC_GKYXC_NHWGK +{ + using DeviceOp = ::DeviceOp<2, NHWGC, GKYXC, NHWGK, int8_t>; + + constexpr static auto expected = { + // clang-format off + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", + "" + // clang-format on + }; +}; + +// 2D kernels - layout NGCHW/GKCYX/NGKHW + +struct F32_2D_NGCHW_GKCYX_NGKHW +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKCYX, NGKHW, float>; + + constexpr static auto expected = { + // clang-format off + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "" + // clang-format on + }; +}; + +struct F16_2D_NGCHW_GKCYX_NGKHW +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKCYX, NGKHW, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "" + // clang-format on + }; +}; + +struct BF16_2DNGCHW_GKCYX_NGKHWK +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKCYX, NGKHW, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "" + // clang-format on + }; +}; + +// 2D kernels - layout NGCHW/GKYXC/NGKHW + +struct F32_2D_NGCHW_GKYXC_NGKHW +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKYXC, NGKHW, float>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F16_2D_NGCHW_GKYXC_NGKHW +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKYXC, NGKHW, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF16_2D_NGCHW_GKYXC_NGKHW +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKYXC, NGKHW, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct S8_2D_NGCHW_GKYXC_NGKHW +{ + using DeviceOp = ::DeviceOp<2, NGCHW, GKYXC, NGKHW, int8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +// 3D - layout GNDHWC/GKZYXC/GNDHWK + +struct F32_3D_GNDHWC_GKZYXC_GNDHWK +{ + using DeviceOp = ::DeviceOp<3, GNDHWC, GKZYXC, GNDHWK, float>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F16_3D_GNDHWC_GKZYXC_GNDHWK +{ + using DeviceOp = ::DeviceOp<3, GNDHWC, GKZYXC, GNDHWK, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF16_3D_GNDHWC_GKZYXC_GNDHWK +{ + using DeviceOp = ::DeviceOp<3, GNDHWC, GKZYXC, GNDHWK, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct S8_3D_GNDHWC_GKZYXC_GNDHWK +{ + using DeviceOp = ::DeviceOp<3, GNDHWC, GKZYXC, GNDHWK, int8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +// 3D - layout NDHWGC/GKZYXC/NDHWGK + +struct F32_3D_NDHWGC_GKZYXC_NDHWGK_TF32 +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, float, float, float, ck::tf32_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + // clang-format on + }; +}; + +struct F32_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, float>; + + constexpr static auto expected = { + // clang-format off + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "" + // clang-format on + }; +}; + +struct F16_3D_NDHWGC_GKZYXC_NDHWGK_F8 +{ + using DeviceOp = + ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::half_t, ck::half_t, ck::half_t, ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F8_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF8_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::bf8_t, ck::bf8_t, ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F8_BF8_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = + ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::f8_t, ck::bf8_t, ck::f8_t, ck::f8_t, ck::bf8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct BF8_F8_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = + ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::bf8_t, ck::f8_t, ck::f8_t, ck::bf8_t, ck::f8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +struct F16_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + // clang-format on + }; +}; + +struct BF16_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + // clang-format on + }; +}; + +struct S8_3D_NDHWGC_GKZYXC_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NDHWGC, GKZYXC, NDHWGK, int8_t>; + + constexpr static auto expected = { + // clang-format off + "" + // clang-format on + }; +}; + +// 3D - layout NGCDHW/GKCZYX/NGKDHW + +struct F32_3D_NGCDHW_GKCZYX_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NGCDHW, GKCZYX, NGKDHW, float>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + // clang-format on + }; +}; + +struct F16_3D_NGCDHW_GKCZYX_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NGCDHW, GKCZYX, NGKDHW, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + // clang-format on + }; +}; + +struct BF16_3D_NGCDHW_GKCZYX_NDHWGK +{ + using DeviceOp = ::DeviceOp<3, NGCDHW, GKCZYX, NGKDHW, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + // clang-format on + }; +}; + +using TestTypes = ::testing::Types; + +TYPED_TEST_SUITE(CkFactoryTestConvFwd, TestTypes); + +TYPED_TEST(CkFactoryTestConvFwd, TestInstances) +{ + auto actual = TestFixture::get_actual_instances(); + auto expected = TestFixture::get_expected_instances(); + + EXPECT_THAT(actual, InstancesMatch(expected)); +} diff --git a/experimental/builder/test/testing_utils.cpp b/experimental/builder/test/testing_utils.cpp index 51a4072e34..34793b601e 100644 --- a/experimental/builder/test/testing_utils.cpp +++ b/experimental/builder/test/testing_utils.cpp @@ -252,7 +252,14 @@ bool InstanceMatcher::MatchAndExplain(InstanceSet actual, *listener << " Missing: " << instances.size() << "\n"; for(const auto& instance : instances) { - *listener << "- " << instance << "\n"; + if(instance == "") + { + *listener << "- (empty string)\n"; + } + else + { + *listener << "- " << instance << "\n"; + } } } @@ -270,8 +277,7 @@ bool InstanceMatcher::MatchAndExplain(InstanceSet actual, { if(instance == "") { - *listener << "- (empty string; indicates missing GetInstanceString() overload)" - << "\n"; + *listener << "- (empty string)\n"; } else { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp index ea871f3325..b086a8fbab 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp @@ -300,7 +300,7 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) {