From 2a309d7534ec55204efefad069bb3a51eb424120 Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Mon, 27 Oct 2025 15:09:46 +0100 Subject: [PATCH] ck-builder: ck factory grouped conv fwd bias clamp --- experimental/builder/test/CMakeLists.txt | 1 + ...grouped_convolution_forward_bias_clamp.cpp | 1269 +++++++++++++++++ 2 files changed, 1270 insertions(+) create mode 100644 experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index 3f9f11fcf6..d1f439682d 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -36,3 +36,4 @@ add_ck_factory_test(test_ck_factory_grouped_convolution_forward test_ck_factory_ add_ck_factory_test(test_ck_factory_grouped_convolution_forward_convscale test_ck_factory_grouped_convolution_forward_convscale.cpp) add_ck_factory_test(test_ck_factory_grouped_convolution_forward_scale test_ck_factory_grouped_convolution_forward_scale.cpp) add_ck_factory_test(test_ck_factory_grouped_convolution_forward_scaleadd_ab test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp) +add_ck_factory_test(test_ck_factory_grouped_convolution_forward_bias_clamp test_ck_factory_grouped_convolution_forward_bias_clamp.cpp) diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp new file mode 100644 index 0000000000..a9e5e39e8f --- /dev/null +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp @@ -0,0 +1,1269 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+/* NOTE(review): the first #include below carries no header name in this copy of the patch; it looks lost in extraction, so confirm against the original file before applying. */ +#include + +#include "testing_utils.hpp" + +using ck_tile::test::InstanceSet; +using ck_tile::test::InstancesMatch; + +namespace { +/* File-local layout aliases. NOTE(review): both DeviceOpHelper specializations below declare their own InLayout, WeiLayout and OutLayout, and nothing else visible in this view refers to these three; confirm they are still needed. */ +using InLayout = ck::tensor_layout::convolution::NHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::NHWGK; + +using ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; +using ck::tensor_operation::element_wise::AddClamp; +using ck::tensor_operation::element_wise::PassThrough; +/* Primary declaration of DeviceOpHelper; in this view only the specializations for 2 and 3 as first argument are defined. NOTE(review): the parameter list after each template keyword is absent in this copy (angle-bracket text appears stripped); confirm against the original file. */ +template +struct DeviceOpHelper; +/* Specialization for 2 as first argument: uses the NHWGC, GKYXC and NHWGK layouts and exposes Type, a DeviceGroupedConvFwdMultipleABD instantiation with T as input, weight and output data type, PassThrough, PassThrough, AddClamp as its element_wise arguments and ComputeType last. NOTE(review): both ck::Tuple arguments appear to have lost their own template arguments. */ +template +struct DeviceOpHelper<2, T, ComputeType> +{ + using InLayout = ck::tensor_layout::convolution::NHWGC; + using WeiLayout = ck::tensor_layout::convolution::GKYXC; + using OutLayout = ck::tensor_layout::convolution::NHWGK; + + using Type = DeviceGroupedConvFwdMultipleABD<2, + InLayout, + WeiLayout, + ck::Tuple, // DsLayout + OutLayout, + T, // InDataType + T, // WeiDataType + ck::Tuple, // DsDataType + T, // OutDataType + PassThrough, + PassThrough, + AddClamp, + ComputeType>; +}; +/* Specialization for 3 as first argument: same shape as the 2 case but with the NDHWGC, GKZYXC and NDHWGK layouts. NOTE(review): both ck::Tuple arguments appear to have lost their own template arguments. */ +template +struct DeviceOpHelper<3, T, ComputeType> +{ + using InLayout = ck::tensor_layout::convolution::NDHWGC; + using WeiLayout = ck::tensor_layout::convolution::GKZYXC; + using OutLayout = ck::tensor_layout::convolution::NDHWGK; + + using Type = DeviceGroupedConvFwdMultipleABD<3, + InLayout, + WeiLayout, + ck::Tuple, // DsLayout + OutLayout, + T, // InDataType + T, // WeiDataType + ck::Tuple, // DsDataType + T, // OutDataType + PassThrough, + PassThrough, + AddClamp, + ComputeType>; +}; +/* DeviceOp names the nested Type of DeviceOpHelper. NOTE(review): the arguments passed to DeviceOpHelper are missing in this copy; confirm. */ +template +using DeviceOp = DeviceOpHelper::Type; + +} // namespace +/* Test fixture derived from testing::Test: get_actual_instances() returns InstanceSet::from_factory(), and get_expected_instances() wraps Case::expected in an InstanceSet. NOTE(review): Case is presumably the fixture's template parameter, whose declaration is not visible in this copy. */ +template +struct CkFactoryTestConvFwd : public testing::Test +{ + static auto get_actual_instances() + { + return InstanceSet::from_factory(); + } + + static auto get_expected_instances() { return InstanceSet(Case::expected); } +}; + +struct F32_2D +{ + using DeviceOp = ::DeviceOp<2, float>; + + constexpr static auto expected = { + // clang-format off + "", +
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + // clang-format on + }; +}; + +struct F32_TF32_2D +{ + using DeviceOp = ::DeviceOp<2, float, ck::tf32_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + // clang-format on + }; +}; + +struct F16_2D +{ + using DeviceOp = ::DeviceOp<2, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + // clang-format on + }; +}; + +struct BF16_2D +{ + using DeviceOp = ::DeviceOp<2, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + // clang-format on + }; +}; + +struct F32_3D +{ + using DeviceOp = ::DeviceOp<3, float>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + // clang-format on + }; +}; + +struct F32_TF32_3D +{ + using DeviceOp = ::DeviceOp<3, float, ck::tf32_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + // clang-format on + }; +}; + +struct F16_3D +{ + using DeviceOp = ::DeviceOp<3, ck::half_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + // clang-format on + }; +}; + +struct BF16_3D +{ + using DeviceOp = ::DeviceOp<3, ck::bhalf_t>; + + constexpr static auto expected = { + // clang-format off + "", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", + 
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + // clang-format on + }; +}; + +using TestTypes = + ::testing::Types; + +TYPED_TEST_SUITE(CkFactoryTestConvFwd, TestTypes); + +TYPED_TEST(CkFactoryTestConvFwd, TestInstances) +{ + auto actual = TestFixture::get_actual_instances(); + auto expected = TestFixture::get_expected_instances(); + + EXPECT_THAT(actual, InstancesMatch(expected)); +}